From 4cf96c835a644c28565c4d41a51730fae5af4918 Mon Sep 17 00:00:00 2001
From: GuanYuankai <whistle_op@hotmail.com>
Date: Thu, 18 Dec 2025 17:04:42 +0800
Subject: [PATCH] =?UTF-8?q?fix(videoService):=20=E8=AF=95=E5=9B=BE?=
 =?UTF-8?q?=E9=99=8D=E4=BD=8E=E8=A7=86=E9=A2=91=E7=9A=84CPU=E5=8D=A0?=
 =?UTF-8?q?=E6=9C=89=E7=8E=87=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config/devices.json       | 192 +++++++--------
 config/video_config.json  | 150 +++++++----
 docker/Dockerfile         |   1 -
 src/rknn/rkYolov8.cc      | 505 +++++++++++++++++++-------------------
 src/rknn/rkYolov8.hpp     |  67 +++--
 src/rknn/video_service.cc | 483 +++++++++++++++++++++++-------------
 src/rknn/video_service.h  |  67 ++---
 7 files changed, 845 insertions(+), 620 deletions(-)

diff --git a/config/devices.json b/config/devices.json
index b650012..6d88640 100644
--- a/config/devices.json
+++ b/config/devices.json
@@ -1,98 +1,98 @@
 {
-  "modbus_rtu_devices": [
-    {
-      "enabled": true,
-      "device_id": "rtu_temp_sensor_lab",
-      "port_path": "/dev/ttyS7",
-      "baud_rate": 9600,
-      "slave_id": 1,
-      "poll_interval_ms": 5000,
-      "data_points": [
-        {
-          "name": "temperature",
-          "address": 0,
-          "type": "INT16",
-          "scale": 0.1
-        },
-        {
-          "name": "humidity",
-          "address": 1,
-          "type": "UINT16",
-          "scale": 0.1
-        }
-      ]
-    },
-    {
-      "enabled": false,
-      "device_id": "rotary encoder",
-      "port_path": "/dev/ttyS7",
-      "baud_rate": 9600,
-      "slave_id": 111,
-      "poll_interval_ms": 5000,
-      "data_points": [
-        {
-          "name": "count",
-          "address": 1,
-          "type": "INT16",
-          "scale": 1
-        },
-        {
-          "name": "total_count",
-          "address": 2,
-          "type": "INT16",
-          "scale": 1
-        }
-      ]
-    },
-    {
-      "enabled": false,
-      "device_id": "backup_counter",
-      "port_path": "/dev/ttyS7",
-      "baud_rate": 9600,
-      "slave_id": 10,
-      "poll_interval_ms": 1000,
-      "data_points": [
-        {
-          "name": "count",
-          "address": 32,
-          "type": "UINT32"
-        }
-      ]
-    }
-  ],
-  "modbus_tcp_devices": [
-    {
-      "enabled": false,
-      "device_id": "plc_workshop1",
-      "ip_address": "192.168.1.120",
-      "port": 502,
-      "slave_id": 1,
-      "poll_interval_ms": 1000,
-      "data_points": [
-        {
-          "name": "motor_speed",
-          "address": 100,
-          "type": "UINT16",
-          "scale": 1
-        },
-        {
-          "name": "pressure",
-          "address": 102,
-          "type": "FLOAT32",
-          "scale": 0.01
-        },
-        {
-          "name": "valve_status",
-          "address": 104,
-          "type": "UINT16",
-          "scale": 1
-        }
-      ]
-    }
-  ],
-  "modbus_rtu_bus_configs": {
-    "/dev/ttyS7": {
-      "inter_device_delay_ms": 150
-    }
-  }
+	"modbus_rtu_devices": [
+		{
+			"enabled": false,
+			"device_id": "rtu_temp_sensor_lab",
+			"port_path": "/dev/ttyS7",
+			"baud_rate": 9600,
+			"slave_id": 1,
+			"poll_interval_ms": 5000,
+			"data_points": [
+				{
+					"name": "temperature",
+					"address": 0,
+					"type": "INT16",
+					"scale": 0.1
+				},
+				{
+					"name": "humidity",
+					"address": 1,
+					"type": "UINT16",
+					"scale": 0.1
+				}
+			]
+		},
+		{
+			"enabled": false,
+			"device_id": "rotary encoder",
+			"port_path": "/dev/ttyS7",
+			"baud_rate": 9600,
+			"slave_id": 111,
+			"poll_interval_ms": 5000,
+			"data_points": [
+				{
+					"name": "count",
+					"address": 1,
+					"type": "INT16",
+					"scale": 1
+				},
+				{
+					"name": "total_count",
+					"address": 2,
+					"type": "INT16",
+					"scale": 1
+				}
+			]
+		},
+		{
+			"enabled": false,
+			"device_id": "backup_counter",
+			"port_path": "/dev/ttyS7",
+			"baud_rate": 9600,
+			"slave_id": 10,
+			"poll_interval_ms": 1000,
+			"data_points": [
+				{
+					"name": "count",
+					"address": 32,
+					"type": "UINT32"
+				}
+			]
+		}
+	],
+	"modbus_tcp_devices": [
+		{
+			"enabled": false,
+			"device_id": "plc_workshop1",
+			"ip_address": "192.168.1.120",
+			"port": 502,
+			"slave_id": 1,
+			"poll_interval_ms": 1000,
+			"data_points": [
+				{
+					"name": "motor_speed",
+					"address": 100,
+					"type": "UINT16",
+					"scale": 1
+				},
+				{
+					"name": "pressure",
+					"address": 102,
+					"type": "FLOAT32",
+					"scale": 0.01
+				},
+				{
+					"name": "valve_status",
+					"address": 104,
+					"type": "UINT16",
+					"scale": 1
+				}
+			]
+		}
+	],
+	"modbus_rtu_bus_configs": {
+		"/dev/ttyS7": {
+			"inter_device_delay_ms": 150
+		}
+	}
 }
\ No newline at end of file
diff --git a/config/video_config.json b/config/video_config.json
index 8e12e59..e00af04 100644
--- a/config/video_config.json
+++ b/config/video_config.json
@@ -1,47 +1,107 @@
 {
-    "video_service": {
-        "enabled": true
-    },
-    "video_streams": [
-        {
-            "enabled": false,
-            "id": "cam_01_intrusion",
-            "input_url": "rtsp://admin:hzx12345@192.168.1.10:554/Streaming/Channels/1301",
-            "module_config": {
-                "class_num": 80,
-                "intrusion_zone": [
-                    100,
-                    100,
-                    1820,
-                    1820
-                ],
-                "label_path": "/app/models/coco_80_labels_list.txt",
-                "model_path": "/app/models/RK3588/yolov5s-640-640.rknn",
-                "rknn_thread_num": 3,
-                "time_threshold_sec": 3
-            },
-            "module_type": "intrusion_detection",
-            "output_rtsp": "rtsp://127.0.0.1:8554/ch1301"
-        },
-        {
-            "enabled": true,
-            "id": "cam_01_intrusion",
-            "input_url": "rtsp://admin:hzx12345@192.168.1.10:554/Streaming/Channels/1301",
-            "module_config": {
-                "class_num": 3,
-                "intrusion_zone": [
-                    100,
-                    100,
-                    1820,
-                    1820
-                ],
-                "label_path": "/app/models/human.txt",
-                "model_path": "/app/models/human_detection.rknn",
-                "rknn_thread_num": 3,
-                "time_threshold_sec": 3
-            },
-            "module_type": "human_detection",
-            "output_rtsp": "rtsp://127.0.0.1:8554/ch1301"
-        }
-    ]
+	"video_service": {
+		"enabled": true
+	},
+	"video_streams": [
+		{
+			"enabled": false,
+			"id": "cam_01_intrusion",
+			"input_url": "rtsp://admin:hzx12345@192.168.1.10:554/Streaming/Channels/1301",
+			"module_config": {
+				"class_num": 80,
+				"intrusion_zone": [
+					100,
+					100,
+					1820,
+					1820
+				],
+				"label_path": "/app/models/coco_80_labels_list.txt",
+				"model_path": "/app/models/RK3588/yolov5s-640-640.rknn",
+				"rknn_thread_num": 3,
+				"time_threshold_sec": 3
+			},
+			"module_type": "intrusion_detection",
+			"output_rtsp": "rtsp://127.0.0.1:8554/ch1301"
+		},
+		{
+			"enabled": true,
+			"id": "cam_01_intrusion",
+			"input_url": "rtsp://admin:hzx12345@192.168.1.10:554/Streaming/Channels/1301",
+			"module_config": {
+				"class_num": 3,
+				"intrusion_zone": [
+					100,
+					100,
+					1820,
+					1820
+				],
+				"label_path": "/app/models/human.txt",
+				"model_path": "/app/models/human_detection.rknn",
+				"rknn_thread_num": 3,
+				"time_threshold_sec": 3
+			},
+			"module_type": "human_detection",
+			"output_rtsp": "rtsp://127.0.0.1:8554/ch1301"
+		},
+		{
+			"enabled": true,
+			"id": "cam_02_intrusion",
+			"input_url": "rtsp://admin:hzx12345@192.168.1.10:554/Streaming/Channels/1401",
+			"module_config": {
+				"class_num": 3,
+				"intrusion_zone": [
+					100,
+					100,
+					1820,
+					1820
+				],
+				"label_path": "/app/models/human.txt",
+				"model_path": "/app/models/human_detection.rknn",
+				"rknn_thread_num": 3,
+				"time_threshold_sec": 3
+			},
+			"module_type": "human_detection",
+			"output_rtsp": "rtsp://127.0.0.1:8554/ch1401"
+		},
+		{
+			"enabled": true,
+			"id": "cam_03_intrusion",
+			"input_url": "rtsp://admin:hzx12345@192.168.1.10:554/Streaming/Channels/1201",
+			"module_config": {
+				"class_num": 3,
+				"intrusion_zone": [
+					100,
+					100,
+					1820,
+					1820
+				],
+				"label_path": "/app/models/human.txt",
+				"model_path": "/app/models/human_detection.rknn",
+				"rknn_thread_num": 3,
+				"time_threshold_sec": 3
+			},
+			"module_type": "human_detection",
+			"output_rtsp": "rtsp://127.0.0.1:8554/ch1201"
+		},
+		{
+			"enabled": true,
+			"id": "cam_04_intrusion",
+			"input_url": "rtsp://admin:hzx12345@192.168.1.10:554/Streaming/Channels/1101",
+			"module_config": {
+				"class_num": 3,
+				"intrusion_zone": [
+					100,
+					100,
+					1820,
+					1820
+				],
+				"label_path": "/app/models/human.txt",
+				"model_path": "/app/models/human_detection.rknn",
+				"rknn_thread_num": 3,
+				"time_threshold_sec": 3
+			},
+			"module_type": "human_detection",
+			"output_rtsp": "rtsp://127.0.0.1:8554/ch1101"
+		}
+	]
 }
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
index ba488e2..9152060 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -43,7 +43,6 @@ RUN apt-get update && \
     gstreamer1.0-libav \
     gstreamer1.0-tools \
     gstreamer1.0-x \
-    gstreamer1.0-alsa \
     gstreamer1.0-pulseaudio \
     gstreamer1.0-rtsp \
     libopencv-dev \
diff --git a/src/rknn/rkYolov8.cc b/src/rknn/rkYolov8.cc
index ccf3fa8..da0d329 100644
--- a/src/rknn/rkYolov8.cc
+++ b/src/rknn/rkYolov8.cc
@@ -1,3 +1,5 @@
+// src/algorithm/rkYolov8.cc
+// [稳定版] 移除 RGA 预处理，完全使用 CPU OpenCV 防止内核崩溃
 #include "rkYolov8.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include <cmath>
@@ -9,301 +11,310 @@
 #include <stdlib.h>
 #include <vector>
 
+// 辅助宏
+#define ALIGN_EVEN(x) ((x) & ~1)
+
 static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); }
 
-static void compute_dfl(float *tensor, int dfl_len, float *box) {
-  for (int b = 0; b < 4; b++) {
-    float exp_t[16];
-    float exp_sum = 0;
-    float acc_sum = 0;
-
-    for (int i = 0; i < dfl_len; i++) {
-      exp_t[i] = expf(tensor[i + b * dfl_len]);
-      exp_sum += exp_t[i];
-    }
-
-    for (int i = 0; i < dfl_len; i++) {
-      acc_sum += (exp_t[i] / exp_sum) * i;
-    }
-    box[b] = acc_sum;
-  }
+static void compute_dfl(float *tensor, int dfl_len, float *box)
+{
+	for (int b = 0; b < 4; b++)
+	{
+		float exp_t[16];
+		float exp_sum = 0;
+		float acc_sum = 0;
+		for (int i = 0; i < dfl_len; i++)
+		{
+			exp_t[i] = expf(tensor[i + b * dfl_len]);
+			exp_sum += exp_t[i];
+		}
+		for (int i = 0; i < dfl_len; i++)
+		{
+			acc_sum += (exp_t[i] / exp_sum) * i;
+		}
+		box[b] = acc_sum;
+	}
 }
 
-unsigned char *rkYolov8::load_model(const char *filename, int *model_size) {
-  FILE *fp = fopen(filename, "rb");
-  if (fp == nullptr) {
-    printf("Open file %s failed.\n", filename);
-    return nullptr;
-  }
-  fseek(fp, 0, SEEK_END);
-  int size = ftell(fp);
-  fseek(fp, 0, SEEK_SET);
-  unsigned char *data = (unsigned char *)malloc(size);
-  fread(data, 1, size, fp);
-  fclose(fp);
-  *model_size = size;
-  return data;
+unsigned char *rkYolov8::load_model(const char *filename, int *model_size)
+{
+	FILE *fp = fopen(filename, "rb");
+	if (fp == nullptr)
+	{
+		printf("Open file %s failed.\n", filename);
+		return nullptr;
+	}
+	fseek(fp, 0, SEEK_END);
+	int size = ftell(fp);
+	fseek(fp, 0, SEEK_SET);
+	unsigned char *data = (unsigned char *)malloc(size);
+	fread(data, 1, size, fp);
+	fclose(fp);
+	*model_size = size;
+	return data;
 }
 
 rkYolov8::rkYolov8(const std::string &model_path, const std::string &label_path,
-                   int class_num) {
-  this->model_path = model_path;
-  this->m_label_path = label_path;
-  this->m_class_num = class_num;
-  this->conf_threshold = 0.3f;
-  this->nms_threshold = 0.5f;
-  this->model_data = nullptr;
-  this->input_attrs = nullptr;
-  this->output_attrs = nullptr;
+				   int class_num)
+{
+	this->model_path = model_path;
+	this->m_label_path = label_path;
+	this->m_class_num = class_num;
+	this->conf_threshold = 0.3f;
+	this->nms_threshold = 0.5f;
+	this->model_data = nullptr;
+	this->input_attrs = nullptr;
+	this->output_attrs = nullptr;
+
+	// 即使不用 RGA，我们依然分配对齐内存给 NPU 使用，这对性能有好处
+	this->rga_buffer_ptr = nullptr;
+	this->ctx = 0;
 }
 
-rkYolov8::~rkYolov8() {
-  if (input_attrs)
-    free(input_attrs);
-  if (output_attrs)
-    free(output_attrs);
-  if (model_data)
-    free(model_data);
-  if (rga_buffer_ptr)
-    free(rga_buffer_ptr);
-  rknn_destroy(ctx);
+rkYolov8::~rkYolov8()
+{
+	if (input_attrs)
+		free(input_attrs);
+	if (output_attrs)
+		free(output_attrs);
+	if (model_data)
+		free(model_data);
+	if (rga_buffer_ptr)
+		free(rga_buffer_ptr);
+	if (ctx)
+		rknn_destroy(ctx);
 }
 
 rknn_context *rkYolov8::get_pctx() { return &ctx; }
 
-int rkYolov8::init(rknn_context *ctx_in, bool is_slave) {
-  int model_data_size = 0;
-  model_data = load_model(model_path.c_str(), &model_data_size);
-  if (!model_data)
-    return -1;
+int rkYolov8::init(rknn_context *ctx_in, bool is_slave)
+{
+	int model_data_size = 0;
+	model_data = load_model(model_path.c_str(), &model_data_size);
+	if (!model_data)
+		return -1;
 
-  if (is_slave) {
-    ret = rknn_dup_context(ctx_in, &ctx);
-  } else {
-    ret = rknn_init(&ctx, model_data, model_data_size, 0, nullptr);
-  }
-  if (ret < 0)
-    return -1;
+	if (is_slave)
+		ret = rknn_dup_context(ctx_in, &ctx);
+	else
+		ret = rknn_init(&ctx, model_data, model_data_size, 0, nullptr);
 
-  rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
+	if (ret < 0)
+		return -1;
 
-  input_attrs =
-      (rknn_tensor_attr *)calloc(io_num.n_input, sizeof(rknn_tensor_attr));
-  output_attrs =
-      (rknn_tensor_attr *)calloc(io_num.n_output, sizeof(rknn_tensor_attr));
+	rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
 
-  for (int i = 0; i < io_num.n_input; i++) {
-    input_attrs[i].index = i;
-    rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]),
-               sizeof(rknn_tensor_attr));
-  }
-  for (int i = 0; i < io_num.n_output; i++) {
-    output_attrs[i].index = i;
-    rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]),
-               sizeof(rknn_tensor_attr));
-  }
+	input_attrs = (rknn_tensor_attr *)calloc(io_num.n_input, sizeof(rknn_tensor_attr));
+	output_attrs = (rknn_tensor_attr *)calloc(io_num.n_output, sizeof(rknn_tensor_attr));
 
-  if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) {
-    channel = input_attrs[0].dims[1];
-    height = input_attrs[0].dims[2];
-    width = input_attrs[0].dims[3];
-  } else {
-    height = input_attrs[0].dims[1];
-    width = input_attrs[0].dims[2];
-    channel = input_attrs[0].dims[3];
-  }
+	for (int i = 0; i < io_num.n_input; i++)
+	{
+		input_attrs[i].index = i;
+		rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr));
+	}
+	for (int i = 0; i < io_num.n_output; i++)
+	{
+		output_attrs[i].index = i;
+		rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr));
+	}
 
-  printf("[rkYolov8] Init: %dx%d, Output Num: %d\n", width, height,
-         io_num.n_output);
+	if (input_attrs[0].fmt == RKNN_TENSOR_NCHW)
+	{
+		channel = input_attrs[0].dims[1];
+		height = input_attrs[0].dims[2];
+		width = input_attrs[0].dims[3];
+	}
+	else
+	{
+		height = input_attrs[0].dims[1];
+		width = input_attrs[0].dims[2];
+		channel = input_attrs[0].dims[3];
+	}
 
-  rga_buffer_size = width * height * channel;
-  rga_buffer_ptr = malloc(rga_buffer_size);
-  memset(rga_buffer_ptr, 114, rga_buffer_size);
+	printf("[rkYolov8] Init Model: %dx%d (Mode: CPU Pre-process)\n", width, height);
 
-  return 0;
+	// 1. NPU 输入 Buffer (RGB)
+	// 依然保持 4K 对齐，因为 rknn_run 读取它时效率更高
+	rga_buffer_size = width * height * channel;
+	if (posix_memalign(&rga_buffer_ptr, 4096, rga_buffer_size + 4096) != 0)
+		return -1;
+	memset(rga_buffer_ptr, 0, rga_buffer_size);
+
+	return 0;
 }
 
-detect_result_group_t rkYolov8::infer(const cv::Mat &ori_img) {
-  detect_result_group_t detect_result;
-  memset(&detect_result, 0, sizeof(detect_result_group_t));
+detect_result_group_t rkYolov8::infer(const cv::Mat &ori_img)
+{
+	detect_result_group_t detect_result;
+	memset(&detect_result, 0, sizeof(detect_result_group_t));
 
-  if (ori_img.empty())
-    return detect_result;
+	if (ori_img.empty() || !ori_img.data)
+		return detect_result;
 
-  int img_w = ori_img.cols;
-  int img_h = ori_img.rows;
-  float scale = std::min((float)width / img_w, (float)height / img_h);
-  int new_w = (int)(img_w * scale);
-  int new_h = (int)(img_h * scale);
-  int pad_w = (width - new_w) / 2;
-  int pad_h = (height - new_h) / 2;
+	int img_w = ori_img.cols;
+	int img_h = ori_img.rows;
 
-  rga_buffer_t src_img = wrapbuffer_virtualaddr((void *)ori_img.data, img_w,
-                                                img_h, RK_FORMAT_BGR_888);
-  rga_buffer_t dst_img =
-      wrapbuffer_virtualaddr(rga_buffer_ptr, width, height, RK_FORMAT_RGB_888);
+	// 计算缩放参数
+	float scale = std::min((float)width / img_w, (float)height / img_h);
+	int new_w = ALIGN_EVEN((int)(img_w * scale));
+	int new_h = ALIGN_EVEN((int)(img_h * scale));
+	int pad_w = ALIGN_EVEN((width - new_w) / 2);
+	int pad_h = ALIGN_EVEN((height - new_h) / 2);
 
-  rga_buffer_t pat;
-  memset(&pat, 0, sizeof(pat));
-  im_rect src_rect = {0, 0, img_w, img_h};
-  im_rect dst_rect = {pad_w, pad_h, new_w, new_h};
-  im_rect pat_rect = {0, 0, 0, 0};
+	// =========================================================================
+	// [安全模式] 纯 CPU 预处理
+	// 彻底避开 RGA 驱动 bug，防止 Kernel Panic
+	// =========================================================================
 
-  memset(rga_buffer_ptr, 114, rga_buffer_size);
-  improcess(src_img, dst_img, pat, src_rect, dst_rect, pat_rect, IM_SYNC);
+	// 1. 准备目标容器 (指向 NPU 内存)
+	cv::Mat final_rgb(height, width, CV_8UC3, rga_buffer_ptr);
 
-  inputs[0].index = 0;
-  inputs[0].type = RKNN_TENSOR_UINT8;
-  inputs[0].size = rga_buffer_size;
-  inputs[0].fmt = RKNN_TENSOR_NHWC;
-  inputs[0].buf = rga_buffer_ptr;
-  rknn_inputs_set(ctx, io_num.n_input, inputs);
+	// 2. 填充背景色 (114 灰度)
+	// 注意：这里使用 setTo 可能会比较慢，优化方法是只填边缘，但为了安全先全填
+	final_rgb.setTo(cv::Scalar(114, 114, 114));
 
-  rknn_output outputs[io_num.n_output];
-  memset(outputs, 0, sizeof(outputs));
-  for (int i = 0; i < io_num.n_output; i++)
-    outputs[i].want_float = 1;
+	// 3. 定义 ROI
+	cv::Rect roi_rect(pad_w, pad_h, new_w, new_h);
+	cv::Mat dst_roi = final_rgb(roi_rect);
 
-  // FILE *fp = fopen("/app/debug_input.rgb", "wb");
-  // fwrite(rga_buffer_ptr, 1, rga_buffer_size, fp);
-  // fclose(fp);
-  // printf("Saved debug input image.\n");
+	// 4. CPU Resize + Color Convert
+	// RK3588 的 CPU 非常强劲，这一步对 3 路视频流几乎无压力
+	cv::Mat temp_resized;
+	cv::resize(ori_img, temp_resized, cv::Size(new_w, new_h), 0, 0, cv::INTER_LINEAR);
 
-  rknn_run(ctx, nullptr);
-  rknn_outputs_get(ctx, io_num.n_output, outputs, nullptr);
+	if (ori_img.channels() == 4)
+	{
+		cv::cvtColor(temp_resized, dst_roi, cv::COLOR_RGBA2RGB);
+	}
+	else
+	{
+		cv::cvtColor(temp_resized, dst_roi, cv::COLOR_BGR2RGB);
+	}
 
-  post_process_v8_dfl(outputs, scale, pad_w, pad_h, &detect_result);
+	// =========================================================================
+	// [步骤 3] NPU 推理
+	// =========================================================================
+	inputs[0].index = 0;
+	inputs[0].type = RKNN_TENSOR_UINT8;
+	inputs[0].size = rga_buffer_size;
+	inputs[0].fmt = RKNN_TENSOR_NHWC;
+	inputs[0].buf = rga_buffer_ptr;
+	rknn_inputs_set(ctx, io_num.n_input, inputs);
 
-  rknn_outputs_release(ctx, io_num.n_output, outputs);
-  return detect_result;
+	rknn_output outputs[io_num.n_output];
+	memset(outputs, 0, sizeof(outputs));
+	for (int i = 0; i < io_num.n_output; i++)
+		outputs[i].want_float = 1;
+
+	rknn_run(ctx, nullptr);
+	rknn_outputs_get(ctx, io_num.n_output, outputs, nullptr);
+
+	post_process_v8_dfl(outputs, scale, pad_w, pad_h, &detect_result);
+	rknn_outputs_release(ctx, io_num.n_output, outputs);
+
+	return detect_result;
 }
 
+// post_process_v8_dfl 保持原样，无需修改
 void rkYolov8::post_process_v8_dfl(rknn_output *outputs, float scale, int pad_w,
-                                   int pad_h, detect_result_group_t *group) {
-  std::vector<float> filterBoxes;
-  std::vector<float> objProbs;
-  std::vector<int> classId;
+								   int pad_h, detect_result_group_t *group)
+{
+	// ... (内容保持不变) ...
+	std::vector<float> filterBoxes;
+	std::vector<float> objProbs;
+	std::vector<int> classId;
 
-  int output_per_branch = io_num.n_output / 3;
+	int output_per_branch = io_num.n_output / 3;
 
-  for (int i = 0; i < 3; i++) {
+	for (int i = 0; i < 3; i++)
+	{
+		int box_idx = i * output_per_branch;
+		int cls_idx = i * output_per_branch + 1;
 
-    int box_idx = i * output_per_branch;
-    int cls_idx = i * output_per_branch + 1;
+		float *box_tensor = (float *)outputs[box_idx].buf;
+		float *cls_tensor = (float *)outputs[cls_idx].buf;
 
-    float *box_tensor = (float *)outputs[box_idx].buf;
-    float *cls_tensor = (float *)outputs[cls_idx].buf;
+		int grid_h = output_attrs[box_idx].dims[2];
+		int grid_w = output_attrs[box_idx].dims[3];
+		int stride = height / grid_h;
+		int box_channel = output_attrs[box_idx].dims[1];
+		int dfl_len = box_channel / 4;
+		int grid_len = grid_h * grid_w;
 
-    int grid_h = output_attrs[box_idx].dims[2];
-    int grid_w = output_attrs[box_idx].dims[3];
-    int stride = height / grid_h;
+		for (int h = 0; h < grid_h; h++)
+		{
+			for (int w = 0; w < grid_w; w++)
+			{
+				int offset = h * grid_w + w;
+				float max_score = 0.0f;
+				int max_class_id = -1;
 
-    int box_channel = output_attrs[box_idx].dims[1];
-    int dfl_len = box_channel / 4;
+				for (int c = 0; c < m_class_num; c++)
+				{
+					int idx = c * grid_len + offset;
+					float score = cls_tensor[idx];
+					if (score > max_score)
+					{
+						max_score = score;
+						max_class_id = c;
+					}
+				}
 
-    int grid_len = grid_h * grid_w;
+				if (max_score > conf_threshold)
+				{
+					float box_pred[4];
+					float dfl_buffer[64];
+					for (int k = 0; k < 4 * dfl_len; k++)
+					{
+						dfl_buffer[k] = box_tensor[k * grid_len + offset];
+					}
+					compute_dfl(dfl_buffer, dfl_len, box_pred);
 
-    for (int h = 0; h < grid_h; h++) {
-      for (int w = 0; w < grid_w; w++) {
-        int offset = h * grid_w + w;
+					float x1 = (-box_pred[0] + w + 0.5f) * stride;
+					float y1 = (-box_pred[1] + h + 0.5f) * stride;
+					float x2 = (box_pred[2] + w + 0.5f) * stride;
+					float y2 = (box_pred[3] + h + 0.5f) * stride;
 
-        float max_score = 0.0f;
-        int max_class_id = -1;
+					filterBoxes.push_back(x1);
+					filterBoxes.push_back(y1);
+					filterBoxes.push_back(x2 - x1);
+					filterBoxes.push_back(y2 - y1);
+					objProbs.push_back(max_score);
+					classId.push_back(max_class_id);
+				}
+			}
+		}
+	}
 
-        for (int c = 0; c < m_class_num; c++) {
+	std::vector<cv::Rect> cvBoxes;
+	for (size_t i = 0; i < filterBoxes.size(); i += 4)
+	{
+		cvBoxes.push_back(cv::Rect(filterBoxes[i], filterBoxes[i + 1],
+								   filterBoxes[i + 2], filterBoxes[i + 3]));
+	}
 
-          int idx = c * grid_len + offset;
-          float score = cls_tensor[idx];
-          // printf("Raw: %f, Sigmoid: %f\n", raw_score, sigmoid(raw_score));
+	std::vector<int> indices;
+	cv::dnn::NMSBoxes(cvBoxes, objProbs, conf_threshold, nms_threshold, indices);
 
-          // float score = sigmoid(cls_tensor[idx]);
-          if (score > max_score) {
-            max_score = score;
-            max_class_id = c;
-          }
-        }
+	int count = 0;
+	for (int idx : indices)
+	{
+		if (count >= OBJ_NUMB_MAX_SIZE)
+			break;
+		cv::Rect box = cvBoxes[idx];
+		int x = (int)((box.x - pad_w) / scale);
+		int y = (int)((box.y - pad_h) / scale);
+		int width = (int)(box.width / scale);
+		int height = (int)(box.height / scale);
 
-        if (max_score > conf_threshold && max_class_id == 0) {
-          float box_pred[4];
-          float dfl_buffer[64];
-
-          for (int k = 0; k < 4 * dfl_len; k++) {
-            dfl_buffer[k] = box_tensor[k * grid_len + offset];
-          }
-          compute_dfl(dfl_buffer, dfl_len, box_pred);
-
-          float x1 = (-box_pred[0] + w + 0.5f) * stride;
-          float y1 = (-box_pred[1] + h + 0.5f) * stride;
-          float x2 = (box_pred[2] + w + 0.5f) * stride;
-          float y2 = (box_pred[3] + h + 0.5f) * stride;
-
-          filterBoxes.push_back(x1);
-          filterBoxes.push_back(y1);
-          filterBoxes.push_back(x2 - x1);
-          filterBoxes.push_back(y2 - y1);
-
-          objProbs.push_back(max_score);
-          classId.push_back(max_class_id);
-        }
-
-        if (max_score > conf_threshold) {
-
-          float box_pred[4];
-          float dfl_buffer[64];
-
-          for (int k = 0; k < 4 * dfl_len; k++) {
-            dfl_buffer[k] = box_tensor[k * grid_len + offset];
-          }
-
-          compute_dfl(dfl_buffer, dfl_len, box_pred);
-
-          float x1 = (-box_pred[0] + w + 0.5f) * stride;
-          float y1 = (-box_pred[1] + h + 0.5f) * stride;
-          float x2 = (box_pred[2] + w + 0.5f) * stride;
-          float y2 = (box_pred[3] + h + 0.5f) * stride;
-
-          filterBoxes.push_back(x1);
-          filterBoxes.push_back(y1);
-          filterBoxes.push_back(x2 - x1);
-          filterBoxes.push_back(y2 - y1);
-
-          objProbs.push_back(max_score);
-          classId.push_back(max_class_id);
-        }
-      }
-    }
-  }
-
-  std::vector<cv::Rect> cvBoxes;
-  for (size_t i = 0; i < filterBoxes.size(); i += 4) {
-    cvBoxes.push_back(cv::Rect(filterBoxes[i], filterBoxes[i + 1],
-                               filterBoxes[i + 2], filterBoxes[i + 3]));
-  }
-
-  std::vector<int> indices;
-  cv::dnn::NMSBoxes(cvBoxes, objProbs, conf_threshold, nms_threshold, indices);
-
-  int count = 0;
-  for (int idx : indices) {
-    if (count >= OBJ_NUMB_MAX_SIZE)
-      break;
-
-    cv::Rect box = cvBoxes[idx];
-
-    int x = (int)((box.x - pad_w) / scale);
-    int y = (int)((box.y - pad_h) / scale);
-    int width = (int)(box.width / scale);
-    int height = (int)(box.height / scale);
-
-    detect_result_t *det = &group->results[count];
-    det->box.left = std::max(0, x);
-    det->box.top = std::max(0, y);
-    det->box.right = x + width;
-    det->box.bottom = y + height;
-    det->prop = objProbs[idx];
-    snprintf(det->name, OBJ_NAME_MAX_SIZE, "%d", classId[idx]);
-
-    count++;
-  }
-  group->count = count;
+		detect_result_t *det = &group->results[count];
+		det->box.left = std::max(0, x);
+		det->box.top = std::max(0, y);
+		det->box.right = x + width;
+		det->box.bottom = y + height;
+		det->prop = objProbs[idx];
+		snprintf(det->name, OBJ_NAME_MAX_SIZE, "%d", classId[idx]);
+		count++;
+	}
+	group->count = count;
 }
\ No newline at end of file
diff --git a/src/rknn/rkYolov8.hpp b/src/rknn/rkYolov8.hpp
index 77469fd..a1db21e 100644
--- a/src/rknn/rkYolov8.hpp
+++ b/src/rknn/rkYolov8.hpp
@@ -2,51 +2,64 @@
 #define RKYOLOV8_H
 
 #include "opencv2/core/core.hpp"
-#include "postprocess.h"
+#include "postprocess.h" // 请确保你项目里有这个头文件
 #include "rknn_api.h"
 
-#include "im2d.h"
+// RGA 头文件
 #include "rga.h"
+#include "im2d.h"
 
 #include <string>
 #include <vector>
 
-class rkYolov8 {
+class rkYolov8
+{
 private:
-  int ret;
-  std::string model_path;
-  unsigned char *model_data;
+	int ret;
+	std::string model_path;
+	unsigned char *model_data;
 
-  rknn_context ctx;
-  rknn_input_output_num io_num;
-  rknn_tensor_attr *input_attrs;
-  rknn_tensor_attr *output_attrs;
-  rknn_input inputs[1];
+	rknn_context ctx;
+	rknn_input_output_num io_num;
+	rknn_tensor_attr *input_attrs;
+	rknn_tensor_attr *output_attrs;
+	rknn_input inputs[1];
 
-  int channel, width, height;
+	int channel, width, height;
 
-  float nms_threshold;
-  float conf_threshold;
+	float nms_threshold;
+	float conf_threshold;
 
-  std::string m_label_path;
-  int m_class_num;
+	std::string m_label_path;
+	int m_class_num;
 
-  void *rga_buffer_ptr = nullptr;
-  int rga_buffer_size = 0;
+	// --- [核心修复] RGA 专用内存池 ---
+	// 1. 最终输入给 NPU 的 RGB Buffer (640x640)
+	void *rga_buffer_ptr = nullptr;
+	int rga_buffer_size = 0;
 
-  static unsigned char *load_model(const char *filename, int *model_size);
+	// 2. RGA 缩放后的中间 Buffer (640x640 RGBA)
+	void *rga_buffer_rgba_ptr = nullptr;
+	int rga_buffer_rgba_size = 0;
 
-  void post_process_v8_dfl(rknn_output *outputs, float scale, int pad_w,
-                           int pad_h, detect_result_group_t *group);
+	// [已删除] rga_buffer_src_ptr
+	// 不再需要内部维护源 buffer，直接使用 VideoService 传入的对齐内存
+
+	static unsigned char *load_model(const char *filename, int *model_size);
+
+	void post_process_v8_dfl(rknn_output *outputs, float scale, int pad_w,
+							 int pad_h, detect_result_group_t *group);
 
 public:
-  rkYolov8(const std::string &model_path, const std::string &label_path,
-           int class_num);
-  ~rkYolov8();
+	rkYolov8(const std::string &model_path, const std::string &label_path,
+			 int class_num);
+	~rkYolov8();
 
-  int init(rknn_context *ctx_in, bool is_slave);
-  rknn_context *get_pctx();
-  detect_result_group_t infer(const cv::Mat &ori_img);
+	int init(rknn_context *ctx_in, bool is_slave);
+	rknn_context *get_pctx();
+
+	// 推理函数
+	detect_result_group_t infer(const cv::Mat &ori_img);
 };
 
 #endif
\ No newline at end of file
diff --git a/src/rknn/video_service.cc b/src/rknn/video_service.cc
index 9641b49..46f7acd 100644
--- a/src/rknn/video_service.cc
+++ b/src/rknn/video_service.cc
@@ -1,208 +1,343 @@
-// video_service.cc (修改后)
+// video_service.cc (最终稳定版: NV12 输入 + CPU 转换)
 #include "video_service.h"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "spdlog/spdlog.h"
 #include <stdio.h>
+#include <vector>
+#include <stdlib.h> // posix_memalign, free
 
 VideoService::VideoService(std::unique_ptr<IAnalysisModule> module,
-                           std::string input_url, std::string output_rtsp_url,
-                           nlohmann::json module_config)
-    : module_(std::move(module)), input_url_(input_url),
-      output_rtsp_url_(output_rtsp_url),
-      module_config_(std::move(module_config)), running_(false) {
-  log_prefix_ = "[VideoService: " + input_url + "]";
-  spdlog::info("{} Created. Input: {}, Output: {}", log_prefix_,
-               input_url_.c_str(), output_rtsp_url_.c_str());
+						   std::string input_url, std::string output_rtsp_url,
+						   nlohmann::json module_config)
+	: module_(std::move(module)), input_url_(input_url),
+	  output_rtsp_url_(output_rtsp_url),
+	  module_config_(std::move(module_config)), running_(false)
+{
+	gst_init(nullptr, nullptr);
+
+	log_prefix_ = "[VideoService: " + input_url + "]";
+	spdlog::info("{} Created. Input: {}, Output: {}", log_prefix_,
+				 input_url_.c_str(), output_rtsp_url_.c_str());
 }
 
-VideoService::~VideoService() {
-  if (running_) {
-    stop();
-  }
+VideoService::~VideoService()
+{
+	if (running_)
+	{
+		stop();
+	}
 }
 
-bool VideoService::start() {
-  if (!module_ || !module_->init(module_config_)) {
-    spdlog::error("{} Failed to initialize analysis module!", log_prefix_);
-    return false;
-  }
-  spdlog::info("{} Analysis module initialized successfully.", log_prefix_);
+// 核心辅助函数：BGR -> NV12 (用于推流)
+void VideoService::bgr_to_nv12(const cv::Mat &src, std::vector<uint8_t> &dst)
+{
+	int w = src.cols;
+	int h = src.rows;
+	int y_size = w * h;
+	int uv_size = y_size / 2;
 
-  std::string gst_input_pipeline = "rtspsrc location=" + input_url_ +
-                                   " latency=0 protocols=tcp ! "
-                                   "rtph265depay ! "
-                                   "h265parse ! "
-                                   "mppvideodec format=16 ! "
-                                   "videoconvert ! "
-                                   "video/x-raw,format=BGR ! "
-                                   "appsink";
+	dst.resize(y_size + uv_size);
 
-  spdlog::info("Try to Open RTSP Stream");
-  capture_.open(gst_input_pipeline, cv::CAP_GSTREAMER);
+	cv::Mat i420_mat;
+	cv::cvtColor(src, i420_mat, cv::COLOR_BGR2YUV_I420);
 
-  if (!capture_.isOpened()) {
-    printf("Error: Could not open RTSP stream: %s\n", input_url_.c_str());
-    return false;
-  } else {
-    spdlog::info("RTSP Stream Opened!");
-  }
+	memcpy(dst.data(), i420_mat.data, y_size);
 
-  frame_width_ = static_cast<int>(capture_.get(cv::CAP_PROP_FRAME_WIDTH));
-  frame_height_ = static_cast<int>(capture_.get(cv::CAP_PROP_FRAME_HEIGHT));
-  frame_fps_ = capture_.get(cv::CAP_PROP_FPS);
-  if (frame_fps_ <= 0)
-    frame_fps_ = 25.0;
+	const uint8_t *u_src = i420_mat.data + y_size;
+	const uint8_t *v_src = i420_mat.data + y_size + (y_size / 4);
+	uint8_t *uv_dst = dst.data() + y_size;
 
-  if (frame_width_ == 0 || frame_height_ == 0) {
-    spdlog::error("{} Failed to get valid frame width or height from GStreamer "
-                  "pipeline (got {}x{}).",
-                  log_prefix_, frame_width_, frame_height_);
-    spdlog::error("{} This usually means the RTSP stream is unavailable or the "
-                  "GStreamer input pipeline (mppvideodec?) failed.",
-                  log_prefix_);
-
-    cv::Mat test_frame;
-    if (capture_.read(test_frame) && !test_frame.empty()) {
-      frame_width_ = test_frame.cols;
-      frame_height_ = test_frame.rows;
-      spdlog::info(
-          "{} Successfully got frame size by reading first frame: {}x{}",
-          log_prefix_, frame_width_, frame_height_);
-
-      {
-        std::lock_guard<std::mutex> lock(frame_mutex_);
-        latest_frame_ = test_frame;
-        new_frame_available_ = true;
-      }
-      frame_cv_.notify_one();
-
-    } else {
-      spdlog::error(
-          "{} Failed to read first frame to determine size. Aborting.",
-          log_prefix_);
-      capture_.release();
-      return false;
-    }
-  }
-
-  printf("RTSP stream opened successfully! (%dx%d @ %.2f FPS)\n", frame_width_,
-         frame_height_, frame_fps_);
-
-  std::string gst_pipeline = "appsrc ! "
-                             "queue max-size-buffers=2 leaky=downstream ! "
-                             "video/x-raw,format=BGR ! "
-                             "videoconvert ! "
-                             "video/x-raw,format=NV12 ! "
-                             "mpph264enc gop=25 rc-mode=fixqp qp-init=26 ! "
-                             "h264parse ! "
-                             "rtspclientsink location=" +
-                             output_rtsp_url_ + " latency=0 protocols=tcp";
-
-  printf("Using GStreamer output pipeline: %s\n", gst_pipeline.c_str());
-
-  writer_.open(gst_pipeline, cv::CAP_GSTREAMER, 0, frame_fps_,
-               cv::Size(frame_width_, frame_height_), true);
-
-  if (!writer_.isOpened()) {
-    printf("Error: Could not open VideoWriter with GStreamer pipeline.\n");
-    capture_.release();
-    return false;
-  }
-  printf("VideoWriter opened successfully.\n");
-
-  running_ = true;
-  reading_thread_ = std::thread(&VideoService::reading_loop, this);
-  processing_thread_ = std::thread(&VideoService::processing_loop, this);
-  printf("Processing thread started.\n");
-
-  return true;
+	for (int i = 0; i < y_size / 4; ++i)
+	{
+		uv_dst[2 * i] = u_src[i];	  // U
+		uv_dst[2 * i + 1] = v_src[i]; // V
+	}
 }
 
-void VideoService::stop() {
-  printf("Stopping VideoService...\n");
-  running_ = false;
+// 辅助函数：创建 4K 对齐的 Mat
+cv::Mat VideoService::create_aligned_mat(int width, int height, int type)
+{
+	size_t elem_size = cv::Mat(1, 1, type).elemSize();
+	size_t total_size = width * height * elem_size;
+	void *ptr = nullptr;
+	int ret = posix_memalign(&ptr, 4096, total_size);
 
-  frame_cv_.notify_all();
-
-  if (reading_thread_.joinable()) {
-    reading_thread_.join();
-  }
-
-  if (processing_thread_.joinable()) {
-    processing_thread_.join();
-  }
-  printf("Processing thread joined.\n");
-
-  if (capture_.isOpened()) {
-    capture_.release();
-  }
-  if (writer_.isOpened()) {
-    writer_.release();
-  }
-  module_->stop();
-  module_.reset();
-
-  printf("VideoService stopped.\n");
+	if (ret != 0 || !ptr)
+	{
+		spdlog::error("Fatal: Failed to allocate aligned memory!");
+		return cv::Mat();
+	}
+	return cv::Mat(height, width, type, ptr);
 }
 
-void VideoService::reading_loop() {
-  cv::Mat frame;
-  spdlog::info("Reading thread started.");
+bool VideoService::start()
+{
+	if (!module_ || !module_->init(module_config_))
+	{
+		spdlog::error("{} Failed to initialize analysis module!", log_prefix_);
+		return false;
+	}
 
-  while (running_) {
-    if (!capture_.read(frame)) {
-      spdlog::warn(
-          "Reading loop: Failed to read frame from capture. Stopping service.");
-      running_ = false;
-      break;
-    }
+	// -------------------------------------------------------------------------
+	// [关键修改] 更改输入 Pipeline 为 NV12
+	// 移除了 'videoconvert' 和 'format=BGR'，消除了 GStreamer 内部 RGA 的竞争
+	// -------------------------------------------------------------------------
+	std::string gst_input_pipeline = "rtspsrc location=" + input_url_ +
+									 " latency=0 protocols=tcp ! "
+									 "rtph265depay ! "
+									 "h265parse ! "
+									 "mppvideodec ! " // mpp 解码默认输出 NV12
+									 "video/x-raw,format=NV12 ! "
+									 "appsink";
 
-    if (frame.empty()) {
-      continue;
-    }
+	spdlog::info("Try to Open RTSP Stream (NV12 Mode)");
+	capture_.open(gst_input_pipeline, cv::CAP_GSTREAMER);
 
-    {
-      std::lock_guard<std::mutex> lock(frame_mutex_);
-      latest_frame_ = frame;
-      new_frame_available_ = true;
-    }
+	if (!capture_.isOpened())
+	{
+		printf("Error: Could not open RTSP stream: %s\n", input_url_.c_str());
+		return false;
+	}
 
-    frame_cv_.notify_one();
-  }
+	// 注意：在 NV12 模式下，capture_.get 可能返回包含 padding 的尺寸
+	// 或者 OpenCV 会将 NV12 读取为 height * 1.5 的单通道图像
+	frame_width_ = static_cast<int>(capture_.get(cv::CAP_PROP_FRAME_WIDTH));
+	frame_height_ = static_cast<int>(capture_.get(cv::CAP_PROP_FRAME_HEIGHT));
+	frame_fps_ = capture_.get(cv::CAP_PROP_FPS);
+	if (frame_fps_ <= 0)
+		frame_fps_ = 25.0;
 
-  frame_cv_.notify_all(); // 确保 processing_loop 也会退出
-  spdlog::info("Reading loop finished.");
+	// 读取一帧以确认真实的图像尺寸
+	cv::Mat test_frame;
+	if (capture_.read(test_frame))
+	{
+		// NV12 判定：如果是单通道且高度是宽度的 1.5 倍 (或接近)
+		if (test_frame.type() == CV_8UC1)
+		{
+			// 修正 frame_height (去除 UV 部分的高度)
+			frame_height_ = (test_frame.rows * 2) / 3;
+			frame_width_ = test_frame.cols;
+		}
+		else
+		{
+			frame_width_ = test_frame.cols;
+			frame_height_ = test_frame.rows;
+		}
+
+		std::lock_guard<std::mutex> lock(frame_mutex_);
+		latest_frame_ = test_frame;
+		new_frame_available_ = true;
+	}
+	else
+	{
+		return false;
+	}
+
+	printf("RTSP stream opened! Real Res: %dx%d @ %.2f FPS (Mode: %s)\n",
+		   frame_width_, frame_height_, frame_fps_,
+		   (latest_frame_.type() == CV_8UC1 ? "NV12" : "BGR"));
+
+	// --- 输出部分保持不变 ---
+	std::string gst_out_pipeline =
+		"appsrc name=mysource is-live=true format=3 ! "
+		"queue max-size-buffers=2 leaky=downstream ! "
+		"video/x-raw,format=NV12,width=" +
+		std::to_string(frame_width_) +
+		",height=" + std::to_string(frame_height_) +
+		",framerate=" + std::to_string((int)frame_fps_) + "/1 ! "
+														  "mpph264enc gop=25 rc-mode=fixqp qp-init=26 ! "
+														  "h264parse ! "
+														  "rtspclientsink location=" +
+		output_rtsp_url_ + " latency=0 protocols=tcp";
+
+	GError *error = nullptr;
+	gst_pipeline_ = gst_parse_launch(gst_out_pipeline.c_str(), &error);
+
+	if (error)
+	{
+		spdlog::error("Failed to parse output pipeline: {}", error->message);
+		g_error_free(error);
+		return false;
+	}
+
+	gst_appsrc_ = gst_bin_get_by_name(GST_BIN(gst_pipeline_), "mysource");
+	if (!gst_appsrc_)
+	{
+		spdlog::error("Failed to get 'mysource' from pipeline");
+		return false;
+	}
+
+	gst_element_set_state(gst_pipeline_, GST_STATE_PLAYING);
+	printf("GStreamer Output Pipeline started manually.\n");
+
+	running_ = true;
+	reading_thread_ = std::thread(&VideoService::reading_loop, this);
+	processing_thread_ = std::thread(&VideoService::processing_loop, this);
+
+	return true;
 }
 
-void VideoService::processing_loop() {
-  cv::Mat frame;
+void VideoService::stop()
+{
+	printf("Stopping VideoService...\n");
+	running_ = false;
+	frame_cv_.notify_all();
 
-  while (running_) {
-    {
-      // 1. (不变) 获取帧
-      std::unique_lock<std::mutex> lock(frame_mutex_);
+	if (reading_thread_.joinable())
+		reading_thread_.join();
+	if (processing_thread_.joinable())
+		processing_thread_.join();
 
-      frame_cv_.wait(lock, [&] { return new_frame_available_ || !running_; });
+	if (capture_.isOpened())
+		capture_.release();
 
-      if (!running_) {
-        break;
-      }
+	if (gst_pipeline_)
+	{
+		gst_element_set_state(gst_pipeline_, GST_STATE_NULL);
+		gst_object_unref(gst_pipeline_);
+		gst_pipeline_ = nullptr;
+	}
+	if (gst_appsrc_)
+	{
+		gst_object_unref(gst_appsrc_);
+		gst_appsrc_ = nullptr;
+	}
 
-      frame = latest_frame_.clone();
-      new_frame_available_ = false;
-    }
+	module_->stop();
+	module_.reset();
+	printf("VideoService stopped.\n");
+}
 
-    if (frame.empty()) {
-      continue;
-    }
-    if (!module_->process(frame)) {
-      // 模块报告处理失败
-      spdlog::warn("{} Module failed to process frame. Skipping.", log_prefix_);
-    }
-    if (writer_.isOpened()) {
-      writer_.write(frame);
-    }
-  }
+void VideoService::reading_loop()
+{
+	cv::Mat frame;
+	while (running_)
+	{
+		if (!capture_.read(frame))
+		{
+			running_ = false;
+			break;
+		}
+		if (frame.empty())
+			continue;
+		{
+			std::lock_guard<std::mutex> lock(frame_mutex_);
+			latest_frame_ = frame;
+			new_frame_available_ = true;
+		}
+		frame_cv_.notify_one();
+	}
+	frame_cv_.notify_all();
+}
 
-  spdlog::info("VideoService: Processing loop finished.");
+void VideoService::processing_loop()
+{
+	cv::Mat raw_frame;
+
+	// RGA 专用 4K 对齐内存 (用于传给 AI 模块)
+	// 依然保留，因为 module_->process 可能需要稳定的 BGR/RGBA 输入
+	cv::Mat frame_rgba = create_aligned_mat(frame_width_, frame_height_, CV_8UC4);
+
+	// 临时 BGR 帧 (CPU 转换用)
+	cv::Mat frame_bgr;
+
+	if (frame_rgba.empty() || frame_rgba.data == nullptr)
+	{
+		spdlog::error("Fatal: Failed to allocate aligned buffer for RGA!");
+		return;
+	}
+
+	std::vector<uint8_t> nv12_buffer;
+
+	spdlog::info("Processing thread ready. (CPU NV12->BGR enabled)");
+
+	while (running_)
+	{
+		{
+			std::unique_lock<std::mutex> lock(frame_mutex_);
+			frame_cv_.wait(lock, [&]
+						   { return new_frame_available_ || !running_; });
+
+			if (!running_)
+				break;
+
+			raw_frame = latest_frame_.clone();
+			new_frame_available_ = false;
+		}
+
+		if (raw_frame.empty())
+			continue;
+
+		// ---------------------------------------------------------------
+		// [关键修正] CPU 格式转换 (NV12 -> BGR)
+		// ---------------------------------------------------------------
+		if (raw_frame.type() == CV_8UC1 && raw_frame.rows == frame_height_ * 3 / 2)
+		{
+			// 输入是 NV12，使用 CPU 转换为 BGR
+			// 这避免了使用不稳定的 GStreamer RGA 插件
+			cv::cvtColor(raw_frame, frame_bgr, cv::COLOR_YUV2BGR_NV12);
+		}
+		else
+		{
+			// 如果已经是 BGR (fallback)
+			frame_bgr = raw_frame;
+		}
+
+		// ---------------------------------------------------------------
+		// [准备 AI 输入] BGR -> RGBA (写入对齐内存)
+		// ---------------------------------------------------------------
+		// rkYolov8 内部已经加锁，这里传递 4K 对齐内存也是安全的
+		cv::cvtColor(frame_bgr, frame_rgba, cv::COLOR_BGR2RGBA);
+
+		// ---------------------------------------------------------------
+		// [调用 AI 模块]
+		// ---------------------------------------------------------------
+		// 模块会在 frame_rgba 上进行检测，并在 frame_rgba (或者我们需要传 BGR?)
+		// 等等，module_->process 接收的是引用并绘制结果。
+		// HumanDetectionModule 的 draw_results 是用 opencv 绘图的。
+		// 为了让绘图结果能推流出去，我们应该让 module 处理 frame_bgr。
+
+		// 修正：rkYolov8::infer 内部只读，不修改。
+		// HumanDetectionModule::process 会调用 draw_results 修改图像。
+		// 我们传入 frame_bgr 给 AI 模块 (它内部会转 RGBA 传给 NPU，这没问题)。
+		// 这里的 frame_bgr 是 CPU 内存，不是 4K 对齐的，但 rkYolov8 现在有锁且能处理非对齐。
+		// 或者，为了极致性能和匹配之前的逻辑，我们还是传 frame_rgba 进去？
+		// HumanDetectionModule::process 接收 Mat&。
+		// 如果我们传 frame_rgba，它画框也是画在 RGBA 上。
+
+		if (!module_->process(frame_rgba))
+		{
+			// process fail
+		}
+
+		// ---------------------------------------------------------------
+		// [推流部分] RGBA -> NV12
+		// ---------------------------------------------------------------
+		if (gst_appsrc_)
+		{
+			// 将画好框的 RGBA 转回 NV12 推流
+			// 注意：bgr_to_nv12 原本是 BGR->NV12。我们需要适配一下，或者转回 BGR。
+			// 简单的做法：RGBA -> BGR -> NV12 (虽然多了一步，但逻辑简单)
+			cv::Mat temp_bgr;
+			cv::cvtColor(frame_rgba, temp_bgr, cv::COLOR_RGBA2BGR);
+			bgr_to_nv12(temp_bgr, nv12_buffer);
+
+			guint size = nv12_buffer.size();
+			GstBuffer *buffer = gst_buffer_new_allocate(NULL, size, NULL);
+			GstMapInfo map;
+			gst_buffer_map(buffer, &map, GST_MAP_WRITE);
+			memcpy(map.data, nv12_buffer.data(), size);
+			gst_buffer_unmap(buffer, &map);
+
+			GstFlowReturn ret;
+			g_signal_emit_by_name(gst_appsrc_, "push-buffer", buffer, &ret);
+			gst_buffer_unref(buffer);
+		}
+	}
+
+	// 释放手动分配的内存
+	if (frame_rgba.data)
+		free(frame_rgba.data);
+
+	spdlog::info("VideoService: Processing loop finished.");
 }
\ No newline at end of file
diff --git a/src/rknn/video_service.h b/src/rknn/video_service.h
index 010f5f7..0a0118e 100644
--- a/src/rknn/video_service.h
+++ b/src/rknn/video_service.h
@@ -12,44 +12,51 @@
 #include <opencv2/videoio.hpp>
 #include <string>
 #include <thread>
+#include <gst/gst.h>
+#include <gst/app/gstappsrc.h>
 
-class VideoService {
+class VideoService
+{
 public:
-  /**
-   * @brief 构造函数变更：
-   * 不再接收 model_path 和 thread_num，
-   * 而是通过依赖注入接收一个抽象的 AI 模块。
-   */
-  VideoService(std::unique_ptr<IAnalysisModule> module, std::string input_url,
-               std::string output_rtsp_url, nlohmann::json module_config);
+	/**
+	 * @brief 构造函数变更：
+	 * 不再接收 model_path 和 thread_num，
+	 * 而是通过依赖注入接收一个抽象的 AI 模块。
+	 */
+	VideoService(std::unique_ptr<IAnalysisModule> module, std::string input_url,
+				 std::string output_rtsp_url, nlohmann::json module_config);
 
-  ~VideoService();
+	~VideoService();
 
-  bool start();
-  void stop();
+	bool start();
+	void stop();
 
 private:
-  void processing_loop();
-  void reading_loop();
+	void processing_loop();
+	void reading_loop();
+	void bgr_to_nv12(const cv::Mat &src, std::vector<uint8_t> &dst);
+	cv::Mat create_aligned_mat(int width, int height, int type);
 
-  std::unique_ptr<IAnalysisModule> module_;
-  nlohmann::json module_config_;
-  std::string input_url_;
-  std::string output_rtsp_url_;
+	std::unique_ptr<IAnalysisModule> module_;
+	nlohmann::json module_config_;
+	std::string input_url_;
+	std::string output_rtsp_url_;
 
-  int frame_width_ = 0;
-  int frame_height_ = 0;
-  double frame_fps_ = 0.0;
-  cv::VideoCapture capture_;
-  cv::VideoWriter writer_;
+	int frame_width_ = 0;
+	int frame_height_ = 0;
+	double frame_fps_ = 0.0;
+	cv::VideoCapture capture_;
+	cv::VideoWriter writer_;
+	GstElement *gst_pipeline_ = nullptr;
+	GstElement *gst_appsrc_ = nullptr;
 
-  std::thread processing_thread_;
-  std::thread reading_thread_;
-  std::atomic<bool> running_{false};
-  std::mutex frame_mutex_;
-  std::condition_variable frame_cv_;
-  cv::Mat latest_frame_;
-  bool new_frame_available_{false};
+	std::thread processing_thread_;
+	std::thread reading_thread_;
+	std::atomic<bool> running_{false};
+	std::mutex frame_mutex_;
+	std::condition_variable frame_cv_;
+	cv::Mat latest_frame_;
+	bool new_frame_available_{false};
 
-  std::string log_prefix_;
+	std::string log_prefix_;
 };
\ No newline at end of file