FaceRecog/src/face_pipeline.cpp

#include "face_pipeline.h"
#include <vector>
#include <string>

// 构造函数
FacePipeline::FacePipeline(const std::string &model_dir)
    : m_env(ORT_LOGGING_LEVEL_WARNING, "FaceSDK"),
      m_memory_info(Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault))
{
    m_session_options.SetIntraOpNumThreads(4); // 使用4线程
    m_session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

    m_initialized = LoadModels(model_dir);
    if (m_initialized)
    {
        InitMemoryAllocators();
        LOGI("FacePipeline initialized successfully.");
    }
    else
    {
        LOGE("FacePipeline initialization failed.");
    }
}

FacePipeline::~FacePipeline() {}

// (私有) 加载所有模型
bool FacePipeline::LoadModels(const std::string &model_dir)
{
    auto load_session = [&](std::unique_ptr<Ort::Session> &session, const std::string &model_name)
    {
        std::string model_path = model_dir + "/" + model_name;
        try
        {
            session = std::make_unique<Ort::Session>(m_env, model_path.c_str(), m_session_options);
            LOGI("Loaded model: %s", model_path.c_str());
        }
        catch (const Ort::Exception &e)
        {
            LOGE("Error loading model %s: %s", model_path.c_str(), e.what());
            return false;
        }
        return true;
    };

    if (!load_session(m_session_rotator, "model_gray_mobilenetv2_rotcls.onnx"))
        return false;
    if (!load_session(m_session_detector, "faceboxesv2-640x640.onnx"))
        return false;
    if (!load_session(m_session_pose_var, "fsanet-var.onnx"))
        return false;
    if (!load_session(m_session_pose_conv, "fsanet-conv.onnx"))
        return false;
    if (!load_session(m_session_landmarker1, "face_landmarker_pts5_net1.onnx"))
        return false;
    if (!load_session(m_session_landmarker2, "face_landmarker_pts5_net2.onnx"))
        return false;
    if (!load_session(m_session_recognizer, "face_recognizer.onnx"))
        return false;

    LOGI("All 7 models loaded successfully.");
    return true;
}

// (私有) 获取模型输入/输出信息
void FacePipeline::InitMemoryAllocators()
{
    // 【【【 最终修正版 v3 】】】
    auto get_io_names = [&](Ort::Session *session,
                            std::vector<const char *> &input_names,
                            std::vector<const char *> &output_names,
                            std::vector<int64_t> &input_shape,
                            const char *model_name)
    {
        input_names.clear();
        output_names.clear();
        input_shape.clear();

        size_t input_count = session->GetInputCount();
        for (size_t i = 0; i < input_count; ++i)
        {
            auto input_name_ptr = session->GetInputNameAllocated(i, m_allocator);
            if (input_name_ptr == nullptr || input_name_ptr.get() == nullptr)
            {
                LOGE("Model %s input name %zu is null!", model_name, i);
                throw std::runtime_error("Failed to get model input name");
            }
            input_names.push_back(strdup(input_name_ptr.get()));
        }

        size_t output_count = session->GetOutputCount();
        for (size_t i = 0; i < output_count; ++i)
        {
            auto output_name_ptr = session->GetOutputNameAllocated(i, m_allocator);
            if (output_name_ptr == nullptr || output_name_ptr.get() == nullptr)
            {
                LOGE("Model %s output name %zu is null!", model_name, i);
                throw std::runtime_error("Failed to get model output name");
            }
            output_names.push_back(strdup(output_name_ptr.get()));
        }

        if (input_count > 0)
        {
            auto input_type_info = session->GetInputTypeInfo(0);
            auto tensor_info = input_type_info.GetTensorTypeAndShapeInfo();
            input_shape = tensor_info.GetShape();

            if (input_shape.empty())
            {
                LOGE("Model %s input shape is empty!", model_name);
                throw std::runtime_error("Model input shape is empty");
            }

            // 【【【 修正：更详细的 shape 日志 】】】
            std::string shape_str = "[";
            for (long long dim : input_shape)
                shape_str += std::to_string(dim) + ", ";
            shape_str += "]";
            LOGI("Model %s input shape: %s", model_name, shape_str.c_str());

            if (input_shape[0] < 1)
                input_shape[0] = 1; // Set batch size to 1
        }
        else
        {
            LOGE("Model %s has no inputs!", model_name);
        }
    };

    // 为7个模型初始化
    get_io_names(m_session_rotator.get(), m_rot_input_names, m_rot_output_names, m_rot_input_shape, "Rotator");
    get_io_names(m_session_detector.get(), m_det_input_names, m_det_output_names, m_det_input_shape, "Detector");
    get_io_names(m_session_pose_var.get(), m_pose_var_input_names, m_pose_var_output_names, m_pose_var_input_shape, "PoseVar");
    get_io_names(m_session_pose_conv.get(), m_pose_conv_input_names, m_pose_conv_output_names, m_pose_conv_input_shape, "PoseConv");
    get_io_names(m_session_landmarker1.get(), m_lm1_input_names, m_lm1_output_names, m_lm1_input_shape, "Landmarker1");
    get_io_names(m_session_landmarker2.get(), m_lm2_input_names, m_lm2_output_names, m_lm2_input_shape, "Landmarker2");
    get_io_names(m_session_recognizer.get(), m_rec_input_names, m_rec_output_names, m_rec_input_shape, "Recognizer");

    // 检查 Detector 形状
    if (m_det_input_shape.size() < 4)
    {
        LOGE("Detector input shape has < 4 dimensions! Cannot generate anchors.");
        throw std::runtime_error("Detector input shape invalid");
    }
    // 【【【 修正：检查 -1 维度 】】】
    if (m_det_input_shape[2] < 0 || m_det_input_shape[3] < 0)
    {
        LOGE("Detector input shape is dynamic (H/W is -1). This is not supported by the Python logic.");
        // 我们从 Python 源码知道它是 640x640
        LOGI("Forcing detector H/W to 640x640.");
        m_det_input_shape[2] = 640;
        m_det_input_shape[3] = 640;
    }
    generate_anchors_faceboxes(m_det_input_shape[2], m_det_input_shape[3]);

    // 调整Blob缓冲区大小
    size_t max_blob_size = 0;

    // 【【【 修正：安全的 update_max 逻辑 】】】
    auto update_max = [&](const std::vector<int64_t> &shape, const char *model_name)
    {
        if (shape.size() <= 1)
        {
            return; // 忽略 (e.g., [1]) 或空 shape
        }

        size_t s = 1;
        // 从 C (dim 1) 开始循环
        for (size_t i = 1; i < shape.size(); ++i)
        {
            if (shape[i] < 0)
            {
                // 如果是动态维度 (e.g., -1)，我们不能用它来计算 max_blob_size
                LOGE("Model %s has dynamic dimension at index %zu. Skipping for max_blob_size calculation.", model_name, i);
                return; // 跳过这个模型
            }
            s *= static_cast<size_t>(shape[i]);
        }

        if (s > max_blob_size)
        {
            max_blob_size = s;
        }
    };

    update_max(m_rot_input_shape, "Rotator");
    update_max(m_det_input_shape, "Detector");
    update_max(m_pose_var_input_shape, "PoseVar");
    update_max(m_lm1_input_shape, "Landmarker1");
    update_max(m_rec_input_shape, "Recognizer");
    // (我们不调用 lm2，因为它不使用公共 blob)

    if (max_blob_size == 0)
    {
        LOGE("Max blob size is 0, something went wrong with model shape detection!");
        throw std::runtime_error("Max blob size is 0");
    }

    LOGI("Calculated max blob size: %zu", max_blob_size);
    m_blob_buffer.resize(max_blob_size);
    LOGI("m_blob_buffer resized successfully.");
}

// --- 图像预处理辅助函数 ---
void FacePipeline::image_to_blob(const cv::Mat &img, std::vector<float> &blob, const float *mean, const float *std)
{
    int channels = img.channels();
    int height = img.rows;
    int width = img.cols;

    for (int c = 0; c < channels; c++)
    {
        for (int h = 0; h < height; h++)
        {
            for (int w = 0; w < width; w++)
            {
                float val;
                if (channels == 3)
                {
                    val = static_cast<float>(img.at<cv::Vec3b>(h, w)[c]);
                }
                else
                {
                    val = static_cast<float>(img.at<uchar>(h, w));
                }
                blob[c * width * height + h * width + w] = (val - mean[c]) * std[c];
            }
        }
    }
}

Ort::Value FacePipeline::create_tensor(const std::vector<float> &blob_data, const std::vector<int64_t> &input_shape)
{
    return Ort::Value::CreateTensor<float>(m_memory_info,
                                           const_cast<float *>(blob_data.data()),
                                           blob_data.size(),
                                           input_shape.data(),
                                           input_shape.size());
}

// --- 核心管线实现 ---

bool FacePipeline::Extract(const cv::Mat &image, std::vector<float> &feature)
{
    if (!m_initialized)
    {
        LOGE("Extract failed: Pipeline is not initialized.");
        return false;
    }
    if (image.empty())
    {
        LOGE("Extract failed: Input image is empty.");
        return false;
    }

    // --- 1. 旋转检测 ---
    int rot_angle_code = RunRotation(image);
    cv::Mat upright_image;
    if (rot_angle_code >= 0)
    {
        cv::rotate(image, upright_image, rot_angle_code);
    }
    else
    {
        upright_image = image;
    }

    // --- 2. 人脸检测 ---
    std::vector<FaceBox> boxes;
    if (!RunDetection(upright_image, boxes))
    {
        LOGI("Extract failed: No face detected.");
        return false;
    }
    // (Python 使用 topk=2, NMS 后 boxes[0] 即是最佳)
    FaceBox best_box = boxes[0];

    // 裁剪人脸 (用于姿态和关键点)
    // crop_face, (assess_quality)
    // Python 的 crop_face 实现了带 padding 的裁剪
    cv::Rect face_rect_raw(best_box.x1, best_box.y1, best_box.x2 - best_box.x1, best_box.y2 - best_box.y1);
    int pad_top = std::max(0, -face_rect_raw.y);
    int pad_bottom = std::max(0, (face_rect_raw.y + face_rect_raw.height) - upright_image.rows);
    int pad_left = std::max(0, -face_rect_raw.x);
    int pad_right = std::max(0, (face_rect_raw.x + face_rect_raw.width) - upright_image.cols);

    cv::Mat face_crop_padded;
    cv::copyMakeBorder(upright_image, face_crop_padded, pad_top, pad_bottom, pad_left, pad_right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
    cv::Rect face_rect_padded(face_rect_raw.x + pad_left, face_rect_raw.y + pad_top, face_rect_raw.width, face_rect_raw.height);
    cv::Mat face_crop = face_crop_padded(face_rect_padded);

    // --- 5. 人脸对齐 (在姿态检测前，因为姿态检测需要对齐的脸) ---
    // (assess_quality) 调用 self.pose_checker.check(aligned_face)
    // QualityOfPose.check()
    // Landmark5er.inference() -> crop_face -> resize(112, 112)
    // FaceAlign.align() -> 256x256
    //
    // **逻辑冲突**:
    // face_feature_extractor.py L345 (assess_quality) 调用 pose_checker.check(aligned_face)
    // 但 L336 (align_face) 依赖 landmarks
    // 但 L330 (extract_landmarks) 依赖 boxes
    //
    // **修正**: Python 源码 L306 `QualityOfPose` 构造函数 -> L416 `check` -> L389 `detect_angle` -> L370 `transform`
    // QualityOfPose.transform() 接收的是 *未对齐* 的脸部裁剪 (L379 canvas[ny1:ny1 + h, nx1:nx1 + w] = mat)
    // **我的 C++ 逻辑错了**。 姿态检测不需要对齐的脸，它需要 *原始裁剪*。

    // --- 3. 姿态估计 (质量过滤) ---
    FacePose pose;
    if (!RunPose(face_crop, pose))
    {
        LOGI("Extract failed: Pose estimation failed.");
        return false;
    }

    if (std::abs(pose.yaw) > m_pose_threshold || std::abs(pose.pitch) > m_pose_threshold)
    {
        LOGI("Extract failed: Face pose (Y:%.1f, P:%.1f) exceeds threshold (%.1f)", pose.yaw, pose.pitch, m_pose_threshold);
        return false;
    }

    // --- 4. 关键点检测 ---
    FaceLandmark landmark;
    if (!RunLandmark(upright_image, best_box, landmark))
    {
        LOGI("Extract failed: Landmark detection failed.");
        return false;
    }

    // --- 5. 人脸对齐 ---
    cv::Mat aligned_face = RunAlignment(upright_image, landmark);

    // --- 6. 特征提取 ---
    if (!RunRecognition(aligned_face, feature))
    {
        LOGI("Extract failed: Feature recognition failed.");
        return false;
    }

    // --- 7. 归一化 (在 RunRecognition 内部完成) ---
    LOGI("Extract success.");
    return true;
}

// --- 步骤 1: 旋转检测 (来自 face_feature_extractor.py) ---
void FacePipeline::preprocess_rotation(const cv::Mat &image, std::vector<float> &blob_data)
{
    cv::Mat gray_img, resized, cropped, gray_3d;
    cv::cvtColor(image, gray_img, cv::COLOR_BGR2GRAY);
    cv::resize(gray_img, resized, cv::Size(256, 256), 0, 0, cv::INTER_LINEAR);
    int start = (256 - 224) / 2;
    cv::Rect crop_rect(start, start, 224, 224);
    cropped = resized(crop_rect);
    cv::cvtColor(cropped, gray_3d, cv::COLOR_GRAY2BGR);

    // 归一化: / 255.0 (mean=[0,0,0], std=[1,1,1])
    const float mean[3] = {0.0f, 0.0f, 0.0f};
    const float std[3] = {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}; // 乘以 1/255 等于除以 255
    image_to_blob(gray_3d, blob_data, mean, std);
}

int FacePipeline::RunRotation(const cv::Mat &image)
{
    preprocess_rotation(image, m_blob_buffer);
    auto input_tensor = create_tensor(m_blob_buffer, m_rot_input_shape);

    auto output_tensors = m_session_rotator->Run(Ort::RunOptions{nullptr},
                                                 m_rot_input_names.data(), &input_tensor, 1,
                                                 m_rot_output_names.data(), 1);

    float *output_data = output_tensors[0].GetTensorMutableData<float>();
    int max_index = std::distance(output_data, std::max_element(output_data, output_data + 4));

    // (correct_image_rotation)
    if (max_index == 1)
        return cv::ROTATE_90_CLOCKWISE;
    if (max_index == 2)
        return cv::ROTATE_180;
    if (max_index == 3)
        return cv::ROTATE_90_COUNTERCLOCKWISE;
    return -1;
}

// --- 步骤 2: 人脸检测 (来自 facedetector.py) ---
void FacePipeline::preprocess_detection(const cv::Mat &img, std::vector<float> &blob_data)
{
    cv::Mat resized;
    cv::resize(img, resized, cv::Size(m_det_input_shape[3], m_det_input_shape[2])); // 640x640

    // 归一化: (img - [104, 117, 123]) * 1.0
    const float mean[3] = {104.0f, 117.0f, 123.0f}; // BGR
    const float std[3] = {1.0f, 1.0f, 1.0f};
    image_to_blob(resized, blob_data, mean, std);
}

bool FacePipeline::RunDetection(const cv::Mat &image, std::vector<FaceBox> &boxes)
{
    float img_height = (float)image.rows;
    float img_width = (float)image.cols;

    preprocess_detection(image, m_blob_buffer);
    auto input_tensor = create_tensor(m_blob_buffer, m_det_input_shape);

    auto output_tensors = m_session_detector->Run(Ort::RunOptions{nullptr},
                                                  m_det_input_names.data(), &input_tensor, 1,
                                                  m_det_output_names.data(), 2); // 2 outputs!

    const float *bboxes_data = output_tensors[0].GetTensorData<float>(); // [1, N, 4]
    const float *probs_data = output_tensors[1].GetTensorData<float>();  // [1, N, 2]
    long num_anchors = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape()[1];

    if (num_anchors != m_anchors.size())
    {
        LOGE("Anchor size mismatch! Expected %zu, Got %ld", m_anchors.size(), num_anchors);
        return false;
    }

    std::vector<FaceBox> bbox_collection;
    const float variance[2] = {0.1f, 0.2f}; //

    for (long i = 0; i < num_anchors; ++i)
    {
        float conf = probs_data[i * 2 + 1]; // (probs[0, i, 1])
        if (conf < m_det_threshold)
            continue;

        const Anchor &anchor = m_anchors[i];
        float dx = bboxes_data[i * 4 + 0];
        float dy = bboxes_data[i * 4 + 1];
        float dw = bboxes_data[i * 4 + 2];
        float dh = bboxes_data[i * 4 + 3];

        float cx = anchor.cx + dx * variance[0] * anchor.s_kx; //
        float cy = anchor.cy + dy * variance[0] * anchor.s_ky; //
        float w = anchor.s_kx * std::exp(dw * variance[1]);    //
        float h = anchor.s_ky * std::exp(dh * variance[1]);    //

        bbox_collection.push_back({(cx - w / 2.0f) * img_width,
                                   (cy - h / 2.0f) * img_height,
                                   (cx + w / 2.0f) * img_width,
                                   (cy + h / 2.0f) * img_height,
                                   conf});
    }

    boxes = hard_nms(bbox_collection, m_det_iou_threshold, m_det_topk); // (nms_type=0)
    return !boxes.empty();
}

void FacePipeline::generate_anchors_faceboxes(int target_height, int target_width)
{
    // (generate_anchors)
    m_anchors.clear();
    std::vector<int> steps = {32, 64, 128};
    std::vector<std::vector<int>> min_sizes = {{32, 64, 128}, {256}, {512}};
    std::vector<std::vector<int>> feature_maps;
    for (int step : steps)
    {
        feature_maps.push_back({(int)std::ceil((float)target_height / step), (int)std::ceil((float)target_width / step)});
    }

    std::vector<float> offset_32 = {0.0f, 0.25f, 0.5f, 0.75f};
    std::vector<float> offset_64 = {0.0f, 0.5f};

    for (int k = 0; k < feature_maps.size(); ++k)
    {
        auto f_map = feature_maps[k];
        auto tmp_min_sizes = min_sizes[k];
        int f_h = f_map[0];
        int f_w = f_map[1];
        for (int i = 0; i < f_h; ++i)
        {
            for (int j = 0; j < f_w; ++j)
            {
                for (int min_size : tmp_min_sizes)
                {
                    float s_kx = (float)min_size / target_width;
                    float s_ky = (float)min_size / target_height;

                    if (min_size == 32)
                    {
                        for (float offset_y : offset_32)
                            for (float offset_x : offset_32)
                                m_anchors.push_back({(j + offset_x) * steps[k] / target_width, (i + offset_y) * steps[k] / target_height, s_kx, s_ky});
                    }
                    else if (min_size == 64)
                    {
                        for (float offset_y : offset_64)
                            for (float offset_x : offset_64)
                                m_anchors.push_back({(j + offset_x) * steps[k] / target_width, (i + offset_y) * steps[k] / target_height, s_kx, s_ky});
                    }
                    else
                    {
                        m_anchors.push_back({(j + 0.5f) * steps[k] / target_width, (i + 0.5f) * steps[k] / target_height, s_kx, s_ky});
                    }
                }
            }
        }
    }
}

// --- 步骤 3: 姿态估计 (来自 imgchecker.py) ---
void FacePipeline::preprocess_pose(const cv::Mat &img, std::vector<float> &blob_data)
{
    float pad = 0.3f; //
    int h = img.rows;
    int w = img.cols;
    int nh = (int)(h + pad * h);
    int nw = (int)(w + pad * w);
    int nx1 = std::max(0, (nw - w) / 2);
    int ny1 = std::max(0, (nh - h) / 2);

    cv::Mat canvas = cv::Mat::zeros(nh, nw, CV_8UC3);
    img.copyTo(canvas(cv::Rect(nx1, ny1, w, h)));

    cv::Mat resized;
    cv::resize(canvas, resized, cv::Size(m_pose_var_input_shape[3], m_pose_var_input_shape[2])); // 64x64

    // 归一化: (img - 127.5) / 127.5
    const float mean[3] = {127.5f, 127.5f, 127.5f};
    const float std[3] = {1.0f / 127.5f, 1.0f / 127.5f, 1.0f / 127.5f};
    image_to_blob(resized, blob_data, mean, std);
}

bool FacePipeline::RunPose(const cv::Mat &face_crop, FacePose &pose)
{
    preprocess_pose(face_crop, m_blob_buffer);

    // 运行 VAR
    auto input_tensor_var = create_tensor(m_blob_buffer, m_pose_var_input_shape);
    auto output_var = m_session_pose_var->Run(Ort::RunOptions{nullptr},
                                              m_pose_var_input_names.data(), &input_tensor_var, 1,
                                              m_pose_var_output_names.data(), 1);

    // 运行 CONV (使用相同的 blob)
    auto input_tensor_conv = create_tensor(m_blob_buffer, m_pose_conv_input_shape);
    auto output_conv = m_session_pose_conv->Run(Ort::RunOptions{nullptr},
                                                m_pose_conv_input_names.data(), &input_tensor_conv, 1,
                                                m_pose_conv_output_names.data(), 1);

    const float *data_var = output_var[0].GetTensorData<float>();
    const float *data_conv = output_conv[0].GetTensorData<float>();

    // 结合 (平均)
    pose.yaw = (data_var[0] + data_conv[0]) / 2.0f;
    pose.pitch = (data_var[1] + data_conv[1]) / 2.0f;
    pose.roll = (data_var[2] + data_conv[2]) / 2.0f;
    return true;
}

// --- 步骤 4: 关键点检测 (来自 facelandmarks5er.py) ---
void FacePipeline::preprocess_landmark_net1(const cv::Mat &img, std::vector<float> &blob_data)
{
    cv::Mat resized, gray_img;
    cv::resize(img, resized, cv::Size(m_lm1_input_shape[3], m_lm1_input_shape[2])); // 112x112
    cv::cvtColor(resized, gray_img, cv::COLOR_BGR2GRAY);                            //

    // 归一化: 无 (0-255)
    const float mean[1] = {0.0f};
    const float std[1] = {1.0f};
    image_to_blob(gray_img, blob_data, mean, std);
}

// C++ 转译 facelandmarks5er.py::shape_index_process
std::vector<float> FacePipeline::shape_index_process(const Ort::Value &feat_val, const Ort::Value &pos_val)
{
    auto feat_shape = feat_val.GetTensorTypeAndShapeInfo().GetShape();
    auto pos_shape = pos_val.GetTensorTypeAndShapeInfo().GetShape();
    const float *feat_data = feat_val.GetTensorData<float>();
    const float *pos_data = pos_val.GetTensorData<float>();

    long feat_n = feat_shape[0]; // 1
    long feat_c = feat_shape[1];
    long feat_h = feat_shape[2];
    long feat_w = feat_shape[3];
    long pos_n = pos_shape[0];          // 1
    long landmark_x2 = pos_shape[1];    // 10
    int landmark_num = landmark_x2 / 2; // 5

    float m_origin[] = {112.0f, 112.0f};
    float m_origin_patch[] = {15.0f, 15.0f};

    int x_patch_h = (int)(m_origin_patch[0] * feat_h / m_origin[0] + 0.5f);
    int x_patch_w = (int)(m_origin_patch[1] * feat_w / m_origin[1] + 0.5f);
    int feat_patch_h = x_patch_h;
    int feat_patch_w = x_patch_w;

    float r_h = (feat_patch_h - 1) / 2.0f;
    float r_w = (feat_patch_w - 1) / 2.0f;

    std::vector<long> out_shape = {feat_n, feat_c, x_patch_h, (long)landmark_num, x_patch_w};
    std::vector<float> buff(feat_n * feat_c * x_patch_h * landmark_num * x_patch_w, 0.0f);

    for (int i = 0; i < landmark_num; ++i)
    {
        for (int n = 0; n < feat_n; ++n)
        {
            float y_pos = pos_data[n * landmark_x2 + 2 * i + 1];
            float x_pos = pos_data[n * landmark_x2 + 2 * i];

            int y = (int)(y_pos * (feat_h - 1) - r_h + 0.5f);
            int x = (int)(x_pos * (feat_w - 1) - r_w + 0.5f);

            for (int c = 0; c < feat_c; ++c)
            {
                for (int ph = 0; ph < feat_patch_h; ++ph)
                {
                    for (int pw = 0; pw < feat_patch_w; ++pw)
                    {
                        int y_p = y + ph;
                        int x_p = x + pw;

                        long out_idx = n * (feat_c * x_patch_h * landmark_num * x_patch_w) +
                                       c * (x_patch_h * landmark_num * x_patch_w) +
                                       ph * (landmark_num * x_patch_w) +
                                       i * (x_patch_w) +
                                       pw;

                        if (y_p < 0 || y_p >= feat_h || x_p < 0 || x_p >= feat_w)
                        {
                            buff[out_idx] = 0.0f;
                        }
                        else
                        {
                            long feat_idx = n * (feat_c * feat_h * feat_w) +
                                            c * (feat_h * feat_w) +
                                            y_p * (feat_w) +
                                            x_p;
                            buff[out_idx] = feat_data[feat_idx];
                        }
                    }
                }
            }
        }
    }
    return buff;
}

bool FacePipeline::RunLandmark(const cv::Mat &image, const FaceBox &box, FaceLandmark &landmark)
{
    // 1. 裁剪人脸
    cv::Rect face_rect_raw(box.x1, box.y1, box.x2 - box.x1, box.y2 - box.y1);
    int pad_top = std::max(0, -face_rect_raw.y);
    int pad_bottom = std::max(0, (face_rect_raw.y + face_rect_raw.height) - image.rows);
    int pad_left = std::max(0, -face_rect_raw.x);
    int pad_right = std::max(0, (face_rect_raw.x + face_rect_raw.width) - image.cols);
    cv::Mat face_crop_padded;
    cv::copyMakeBorder(image, face_crop_padded, pad_top, pad_bottom, pad_left, pad_right, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
    cv::Rect face_rect_padded(face_rect_raw.x + pad_left, face_rect_raw.y + pad_top, face_rect_raw.width, face_rect_raw.height);
    cv::Mat face_crop = face_crop_padded(face_rect_padded);

    // 2. 预处理 Net1
    preprocess_landmark_net1(face_crop, m_blob_buffer);
    auto input_tensor_net1 = create_tensor(m_blob_buffer, m_lm1_input_shape);

    // 3. 运行 Net1
    auto output_net1 = m_session_landmarker1->Run(Ort::RunOptions{nullptr},
                                                  m_lm1_input_names.data(), &input_tensor_net1, 1,
                                                  m_lm1_output_names.data(), 2); // 2 outputs

    // 4. Shape Index Process
    std::vector<float> shape_index_blob = shape_index_process(output_net1[0], output_net1[1]);

    // 5. 准备 Net2 输入
    auto input_tensor_net2 = Ort::Value::CreateTensor<float>(m_memory_info,
                                                             shape_index_blob.data(),
                                                             shape_index_blob.size(),
                                                             m_lm2_input_shape.data(),
                                                             m_lm2_input_shape.size());

    // 6. 运行 Net2
    auto output_net2 = m_session_landmarker2->Run(Ort::RunOptions{nullptr},
                                                  m_lm2_input_names.data(), &input_tensor_net2, 1,
                                                  m_lm2_output_names.data(), 1);

    // 7. 后处理
    const float *data_net1_pos = output_net1[1].GetTensorData<float>();
    const float *data_net2 = output_net2[0].GetTensorData<float>();
    auto shape_net1_pos = output_net1[1].GetTensorTypeAndShapeInfo().GetShape(); // [1, 10]
    int landmark_x2 = shape_net1_pos[1];

    float scale_x = (box.x2 - box.x1) / 112.0f;
    float scale_y = (box.y2 - box.y1) / 112.0f;

    for (int i = 0; i < 5; ++i)
    {
        float x_norm = (data_net2[i * 2 + 0] + data_net1_pos[i * 2 + 0]) * 112.0f;
        float y_norm = (data_net2[i * 2 + 1] + data_net1_pos[i * 2 + 1]) * 112.0f;

        float x = box.x1 + x_norm * scale_x;
        float y = box.y1 + y_norm * scale_y;

        x = std::max(0.01f, std::min(x, (float)image.cols - 0.01f));
        y = std::max(0.01f, std::min(y, (float)image.rows - 0.01f));
        landmark.points[i] = cv::Point2f(x, y);
    }
    return true;
}

// --- 步骤 5: 人脸对齐 (来自 facealign.py) ---
cv::Mat FacePipeline::RunAlignment(const cv::Mat &image, const FaceLandmark &landmark)
{
    // (align)
    std::vector<cv::Point2f> src_points;
    std::vector<cv::Point2f> dst_points;

    for (int i = 0; i < 5; ++i)
    {
        src_points.push_back(landmark.points[i]);
        dst_points.push_back(cv::Point2f(m_landmark_template.at<float>(i, 0),
                                         m_landmark_template.at<float>(i, 1)));
    }

    // (transformation_maker) -> estimateAffinePartial2D
    cv::Mat transform_matrix = cv::estimateAffinePartial2D(src_points, dst_points);

    cv::Mat aligned_face;
    // (spatial_transform) -> warpAffine
    // (crop_width, crop_height = 256, 256)
    cv::warpAffine(image, aligned_face, transform_matrix, m_align_output_size, cv::INTER_LINEAR);

    return aligned_face;
}

// --- 步骤 6: 特征提取 (来自 facerecoger.py) ---
void FacePipeline::preprocess_recognition(const cv::Mat &img, std::vector<float> &blob_data)
{
    cv::Mat resized, rgb_img;

    const cv::Size target_size(248, 248);

    // (resize to 248, 248)
    cv::resize(img, resized, target_size);

    // (BGR -> RGB)
    cv::cvtColor(resized, rgb_img, cv::COLOR_BGR2RGB);

    // 归一化: 无 (0-255)
    const float mean[3] = {0.0f, 0.0f, 0.0f};
    const float std[3] = {1.0f, 1.0f, 1.0f};
    image_to_blob(rgb_img, blob_data, mean, std);
}

void FacePipeline::normalize_sqrt_l2(std::vector<float> &v)
{
    // (temp_result = np.sqrt(pred_result[0]))
    double norm = 0.0;
    for (float &val : v)
    {
        val = std::sqrt(std::max(0.0f, val)); // 取 sqrt
        norm += val * val;
    }

    // (norm = temp_result / np.linalg.norm(...))
    if (norm > 1e-6)
    {
        norm = std::sqrt(norm);
        for (float &val : v)
        {
            val = static_cast<float>(val / norm);
        }
    }
}

bool FacePipeline::RunRecognition(const cv::Mat &aligned_face, std::vector<float> &feature)
{
    // 【【【 最终修正 v5 】】】

    // 1. 预处理 (这部分是正确的，它生成了 248x248 的 blob)
    preprocess_recognition(aligned_face, m_blob_buffer);

    // 2. (BUG 在这里) 我们不能使用 m_rec_input_shape (它是 [-1, -1, -1, -1])
    //    我们必须硬编码 Python 源码 (facerecoger.py) 中使用的 shape。
    const std::vector<int64_t> hardcoded_shape = {1, 3, 248, 248};

    // 3. (修正) 使用 hardcoded_shape 创建 Tensor
    auto input_tensor = create_tensor(m_blob_buffer, hardcoded_shape);

    // 4. 运行
    auto output_tensors = m_session_recognizer->Run(Ort::RunOptions{nullptr},
                                                    m_rec_input_names.data(), &input_tensor, 1,
                                                    m_rec_output_names.data(), 1);

    long feature_dim = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape()[1];
    const float *output_data = output_tensors[0].GetTensorData<float>();

    feature.resize(feature_dim);
    memcpy(feature.data(), output_data, feature_dim * sizeof(float));

    // 5. 后处理 (SQRT-L2 Norm)
    normalize_sqrt_l2(feature);

    return true;
}