# Face_reg_app/FaceFeatureExtractorAPI/models/facedetector.py

"""
Input: the original image.
Output: the face bounding boxes detected in the image.
"""

import cv2
import numpy as np
import onnxruntime as ort


class Box:
    """Axis-aligned detection box carrying a score and label metadata.

    Area and IoU use the inclusive-pixel convention, i.e. a side length is
    ``x2 - x1 + 1``.
    """

    def __init__(self, x1, y1, x2, y2, score, label=1, label_text='face', flag=True):
        self.x1, self.y1 = x1, y1
        self.x2, self.y2 = x2, y2
        self.score = score
        self.label, self.label_text = label, label_text
        self.flag = flag

    def iou_of(self, other):
        """Return the intersection-over-union of this box with ``other``.

        Returns ``0`` when the boxes do not strictly overlap on both axes
        (boxes that merely touch count as non-overlapping).
        """
        left = max(self.x1, other.x1)
        top = max(self.y1, other.y1)
        right = min(self.x2, other.x2)
        bottom = min(self.y2, other.y2)

        # Written as a negated conjunction so that NaN coordinates fall
        # through to the "no overlap" result.
        if not (left < right and top < bottom):
            return 0

        overlap = (right - left + 1.0) * (bottom - top + 1.0)
        own = (self.x2 - self.x1 + 1.0) * (self.y2 - self.y1 + 1.0)
        theirs = (other.x2 - other.x1 + 1.0) * (other.y2 - other.y1 + 1.0)
        return overlap / (own + theirs - overlap)

    def area(self):
        """Return the box area using inclusive side lengths."""
        width = self.x2 - self.x1 + 1
        height = self.y2 - self.y1 + 1
        return width * height

def hard_nms(boxes, iou_threshold, topk):
    """Greedy (hard) non-maximum suppression.

    Boxes are visited in descending score order; each surviving box
    suppresses every lower-scored box whose IoU with it exceeds
    ``iou_threshold``.

    Args:
        boxes: iterable of box objects exposing ``score`` and ``iou_of(other)``.
            The caller's container is not reordered or modified.
        iou_threshold: IoU above which a lower-scored box is suppressed.
        topk: maximum number of boxes to return; ``topk <= 0`` yields ``[]``.

    Returns:
        A new list holding the kept box objects (the same instances that
        were passed in), highest score first.
    """
    if not boxes or topk <= 0:
        return []

    # sorted() is stable like list.sort(), but leaves the caller's list alone.
    ranked = sorted(boxes, key=lambda box: box.score, reverse=True)
    suppressed = [False] * len(ranked)
    output = []

    for i, keeper in enumerate(ranked):
        if suppressed[i]:
            continue
        output.append(keeper)
        if len(output) >= topk:
            # Nothing after this point can be returned, so skip the scan.
            break

        for j in range(i + 1, len(ranked)):
            if not suppressed[j] and keeper.iou_of(ranked[j]) > iou_threshold:
                suppressed[j] = True

    return output

def blending_nms(boxes, iou_threshold, topk):
    """Non-maximum suppression that blends each cluster into one box.

    Boxes are visited in descending score order. Every lower-scored box whose
    IoU with the current box exceeds ``iou_threshold`` joins its cluster, and
    the cluster is replaced by a single box whose coordinates and score are
    the softmax(score)-weighted average of its members.

    Args:
        boxes: iterable of box objects exposing ``x1, y1, x2, y2, score`` and
            ``iou_of(other)``. Neither the container nor the boxes are modified.
        iou_threshold: IoU above which a box is merged into a cluster.
        topk: maximum number of blended boxes to return; ``topk <= 0``
            yields ``[]``.

    Returns:
        A list of newly created boxes. Each one is a shallow copy of the
        cluster's highest-scoring box (so ``label``/``label_text`` are carried
        over) with blended coordinates and score and ``flag`` set to True.
    """
    import copy

    if not boxes or topk <= 0:
        return []

    # sorted() keeps the caller's list untouched.
    ranked = sorted(boxes, key=lambda box: box.score, reverse=True)
    suppressed = [False] * len(ranked)
    output = []

    for i, leader in enumerate(ranked):
        if suppressed[i]:
            continue

        cluster = [leader]
        for j in range(i + 1, len(ranked)):
            if suppressed[j]:
                continue
            if leader.iou_of(ranked[j]) > iou_threshold:
                suppressed[j] = True
                cluster.append(ranked[j])

        # Softmax weights over the cluster's scores, computed once per member.
        weights = [np.exp(member.score) for member in cluster]
        total = sum(weights)

        blended = copy.copy(leader)
        blended.x1 = blended.y1 = blended.x2 = blended.y2 = 0
        blended.score = 0
        blended.flag = True
        for member, weight in zip(cluster, weights):
            rate = weight / total
            blended.x1 += member.x1 * rate
            blended.y1 += member.y1 * rate
            blended.x2 += member.x2 * rate
            blended.y2 += member.y2 * rate
            blended.score += member.score * rate
        output.append(blended)

        if len(output) >= topk:
            break

    return output

def offset_nms(boxes, iou_threshold, topk):
    """Class-aware hard NMS using the coordinate-offset trick.

    Each box is compared at a position shifted by ``label * 4096`` so that
    boxes carrying different labels are moved apart before their IoU is
    measured. Suppression itself is the same greedy scheme as hard NMS.

    The shift is applied to temporary shallow copies, so the input boxes are
    never modified (previously, suppressed boxes were left shifted in the
    caller's list and the list was re-sorted in place).

    Args:
        boxes: iterable of box objects exposing ``x1, y1, x2, y2, score,
            label`` and ``iou_of(other)``.
        iou_threshold: IoU above which a lower-scored box is suppressed.
        topk: maximum number of boxes to return; ``topk <= 0`` yields ``[]``.

    Returns:
        A new list holding the kept box objects (the same instances that
        were passed in, coordinates untouched), highest score first.
    """
    import copy

    if not boxes or topk <= 0:
        return []

    offset = 4096.0
    ranked = sorted(boxes, key=lambda box: box.score, reverse=True)

    # Shifted stand-ins used only for the IoU comparisons.
    shifted = []
    for box in ranked:
        delta = box.label * offset
        moved = copy.copy(box)
        moved.x1 = box.x1 + delta
        moved.y1 = box.y1 + delta
        moved.x2 = box.x2 + delta
        moved.y2 = box.y2 + delta
        shifted.append(moved)

    suppressed = [False] * len(ranked)
    output = []
    for i, probe in enumerate(shifted):
        if suppressed[i]:
            continue
        output.append(ranked[i])
        if len(output) >= topk:
            break

        for j in range(i + 1, len(shifted)):
            if not suppressed[j] and probe.iou_of(shifted[j]) > iou_threshold:
                suppressed[j] = True

    return output

def draw_rectface(img, box):
    """Draw ``box`` onto ``img`` in place as a red rectangle, 3 px thick.

    The top-left corner is clamped to the image origin and the width/height
    are limited so the rectangle does not extend past the image size.
    Nothing is returned; ``img`` itself is modified.
    """
    img_h, img_w = img.shape[0], img.shape[1]

    left = max(0, int(box.x1))
    top = max(0, int(box.y1))
    width = min(img_w - left, int(box.x2 - left + 1))
    height = min(img_h - top, int(box.y2 - top + 1))

    top_left = (left, top)
    bottom_right = (left + width, top + height)
    cv2.rectangle(img, top_left, bottom_right, (0, 0, 255), 3)

def cut_rectface(img, box):
    """Return the region of ``img`` covered by ``box``, clipped to the image.

    Args:
        img: array indexed as ``img[row, col, ...]``.
        box: object exposing ``x1, y1, x2, y2`` (inclusive corners).

    Returns:
        A slice (view) of ``img``. If the box lies entirely outside the
        image the result is empty along the corresponding axis.
    """
    x = max(0, int(box.x1))
    y = max(0, int(box.y1))
    # Clamp to zero: a negative size would become a negative slice end and
    # select most of the image instead of nothing.
    w = max(0, min(img.shape[1] - x, int(box.x2 - x + 1)))
    h = max(0, min(img.shape[0] - y, int(box.y2 - y + 1)))
    return img[y:y + h, x:x + w]

def normalize_inplace(mat, mean, scale):
    """Return ``(mat - mean) * scale`` as a new float32 array.

    Despite the name, the input array is left untouched: it is first copied
    to float32, and the subtraction and scaling are then applied to that copy.
    """
    normalized = mat.astype(np.float32)
    # out= keeps the float32 buffer regardless of the dtype of mean/scale.
    np.subtract(normalized, mean, out=normalized)
    np.multiply(normalized, scale, out=normalized)
    return normalized

def create_tensor(mat, tensor_dims, memory_info_handler, data_format):
    """Wrap an HWC image as a batch-of-one ONNX Runtime ``OrtValue`` on CPU.

    Args:
        mat: image array of shape ``(rows, cols, channels)``.
        tensor_dims: the 4-D input shape expected by the model, with a batch
            size of 1: ``[1, C, H, W]`` for ``"CHW"`` or ``[1, H, W, C]`` for
            ``"HWC"``.
        memory_info_handler: accepted for interface compatibility; not used.
        data_format: ``"CHW"`` or ``"HWC"``.

    Returns:
        An ``ort.OrtValue`` whose shape matches ``tensor_dims``. The image is
        resized first when its height/width differ from the target.

    Raises:
        RuntimeError: if ``tensor_dims`` is not 4-D, the batch size is not 1,
            the channel count does not match, or ``data_format`` is unknown.
    """
    rows, cols, channels = mat.shape
    if len(tensor_dims) != 4:
        raise RuntimeError("dims mismatch.")
    if tensor_dims[0] != 1:
        raise RuntimeError("batch != 1")

    if data_format == "CHW":
        target_channel, target_height, target_width = tensor_dims[1:]
    elif data_format == "HWC":
        target_height, target_width, target_channel = tensor_dims[1:]
    else:
        # Previously an unknown layout silently returned None.
        raise RuntimeError(f"unsupported data_format: {data_format!r}")

    if target_channel != channels:
        raise RuntimeError("channel mismatch.")

    if target_height != rows or target_width != cols:
        if data_format == "CHW":
            # Diagnostic message kept from the original CHW path.
            print("in create_tensor, resize mat...")
        mat = cv2.resize(mat, (target_width, target_height))

    if data_format == "CHW":
        mat = mat.transpose(2, 0, 1)  # HWC -> CHW

    # Add the batch axis for both layouts (the HWC path used to omit it) and
    # hand ONNX Runtime a C-contiguous buffer, since transpose() only
    # returns a strided view.
    mat = np.ascontiguousarray(np.expand_dims(mat, axis=0))
    return ort.OrtValue.ortvalue_from_numpy(mat, 'cpu')

class BasicOrtHandler:
    """Owns an ONNX Runtime inference session and caches its I/O metadata.

    After construction the instance exposes ``ort_session``,
    ``memory_info_handler``, the name/shape of the first model input
    (``input_node_names``, ``input_node_dims``, ``input_tensor_size``) and the
    names/shapes of all outputs (``output_node_names``, ``output_node_dims``,
    ``num_outputs``).
    """

    def __init__(self, onnx_path, num_threads=1):
        self.onnx_path = onnx_path
        self.num_threads = num_threads
        self.initialize_handler()

    def initialize_handler(self):
        """Create the session for ``self.onnx_path`` and record I/O metadata."""
        session_options = ort.SessionOptions()
        session_options.intra_op_num_threads = self.num_threads
        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        self.ort_session = ort.InferenceSession(self.onnx_path, session_options)
        self.memory_info_handler = ort.OrtMemoryInfo(
            "Cpu", ort.OrtAllocatorType.ORT_ARENA_ALLOCATOR, 0, ort.OrtMemType.DEFAULT
        )

        # Only the first model input is used.
        first_input = self.ort_session.get_inputs()[0]
        self.input_node_names = [first_input.name]
        self.input_node_dims = first_input.shape  # shape of the input tensor
        # NOTE(review): np.prod presumably needs every dimension to be
        # numeric, i.e. a model with fixed input dims - confirm.
        self.input_tensor_size = np.prod(self.input_node_dims)

        outputs = self.ort_session.get_outputs()
        self.output_node_names = [out.name for out in outputs]
        self.output_node_dims = [out.shape for out in outputs]
        self.num_outputs = len(self.output_node_names)

    def __del__(self):
        # The attribute is missing when session creation failed inside
        # __init__; the finalizer must not raise in that case.
        try:
            del self.ort_session
        except AttributeError:
            pass

class FaceBoxesV2(BasicOrtHandler):
    """FaceBoxesV2 face detector running on an ONNX Runtime session.

    Typical use: ``FaceBoxesV2(onnx_path).detect(image)`` returns a list of
    ``Box`` objects in the coordinate system of the image that was passed in.
    """

    def __init__(self, onnx_path, num_threads=1):
        super().__init__(onnx_path, num_threads)
        # Per-channel mean subtracted from (and scale applied to) the input.
        self.mean_vals = np.array([104.0, 117.0, 123.0], dtype=np.float32)
        self.scale_vals = np.array([1.0, 1.0, 1.0], dtype=np.float32)
        # Variances used when decoding box centre (index 0) and size (index 1).
        self.variance = [0.1, 0.2]
        # Stride of each feature map and the anchor sizes placed on it.
        self.steps = [32, 64, 128]
        self.min_sizes = [
            [32, 64, 128],
            [256],
            [512]
        ]
        # Upper bound on the number of candidate boxes handed to NMS.
        self.max_nms = 30000
        # Anchors depend only on the input size; cache them per (height, width).
        self._anchor_cache = {}

    def transform(self, mat):
        """Resize to the model input size, normalise, and wrap as a CHW tensor."""
        canvas = cv2.resize(mat, (self.input_node_dims[3], self.input_node_dims[2]))
        canvas = normalize_inplace(canvas, self.mean_vals, self.scale_vals)
        return create_tensor(canvas, self.input_node_dims, self.memory_info_handler, "CHW")

    def detect(self, mat, score_threshold=0.35, iou_threshold=0.45, topk=300, nms_type=0):
        """Detect faces in ``mat``.

        Args:
            mat: input image array of shape ``(height, width, channels)``.
            score_threshold: minimum face score for a candidate to be kept.
            iou_threshold: IoU threshold used by NMS.
            topk: maximum number of boxes returned by NMS.
            nms_type: ``0`` hard NMS, ``1`` blending NMS, ``2`` offset NMS.

        Returns:
            A list of ``Box`` objects scaled to the size of ``mat``; an empty
            list when ``mat`` is None or empty.

        Raises:
            NotImplementedError: for an unknown ``nms_type``.
            RuntimeError: if the model output does not match the anchor count.
        """
        if mat is None or mat.size == 0:
            # Always hand back a list so callers can iterate the result.
            return []

        img_height = float(mat.shape[0])
        img_width = float(mat.shape[1])

        # 1. make input tensor
        input_tensor = self.transform(mat)
        # 2. inference scores & boxes.
        output_tensors = self.ort_session.run(
            self.output_node_names, {self.input_node_names[0]: input_tensor}
        )
        # 3. rescale & exclude.
        bbox_collection = self.generate_bboxes(output_tensors, score_threshold, img_height, img_width)
        # 4. hard|blend|offset nms with topk.
        return self.nms(bbox_collection, iou_threshold, topk, nms_type)

    def generate_bboxes(self, output_tensors, score_threshold, img_height, img_width):
        """Decode the raw model outputs into scored boxes in image coordinates.

        ``output_tensors[0]`` is indexed as ``[0, i, 0..3]`` (box deltas) and
        ``output_tensors[1]`` as ``[0, i, 1]`` (face score) for anchor ``i``.
        Candidates scoring below ``score_threshold`` are dropped and at most
        ``self.max_nms`` candidates (in anchor order) are returned.

        Raises:
            RuntimeError: if the number of predicted boxes differs from the
                number of generated anchors.
        """
        bboxes = output_tensors[0]  # e.g (1,n,4)
        probs = output_tensors[1]  # e.g (1,n,2) after softmax
        # Use the real tensor shape; the declared output dims are not
        # guaranteed to be numeric.
        bbox_num = bboxes.shape[1]
        input_height = self.input_node_dims[2]  # e.g 640
        input_width = self.input_node_dims[3]  # e.g 640

        anchors = self._anchors_for(input_height, input_width)

        num_anchors = len(anchors)
        if num_anchors != bbox_num:
            print(f"num_anchors={num_anchors} but detected bbox_num={bbox_num}")
            raise RuntimeError("mismatch num_anchors != bbox_num")

        # Filter on the score first so the Python loop only visits survivors.
        # Written as a negated "<" so it keeps exactly what `conf <
        # score_threshold: continue` would keep.
        face_scores = probs[0, :, 1]
        candidates = np.flatnonzero(~(face_scores < score_threshold))
        candidates = candidates[:self.max_nms]  # limit boxes for nms.

        bbox_collection = []
        for i in candidates:
            prior_cx, prior_cy, prior_s_kx, prior_s_ky = anchors[i]

            dx = bboxes[0, i, 0]
            dy = bboxes[0, i, 1]
            dw = bboxes[0, i, 2]
            dh = bboxes[0, i, 3]

            cx = prior_cx + dx * self.variance[0] * prior_s_kx
            cy = prior_cy + dy * self.variance[0] * prior_s_ky
            w = prior_s_kx * np.exp(dw * self.variance[1])
            h = prior_s_ky * np.exp(dh * self.variance[1])  # norm coor (0.,1.)

            bbox_collection.append(Box(
                x1=(cx - w / 2.0) * img_width,
                y1=(cy - h / 2.0) * img_height,
                x2=(cx + w / 2.0) * img_width,
                y2=(cy + h / 2.0) * img_height,
                score=face_scores[i],
                label=1,
                label_text="face",
                flag=True
            ))

        return bbox_collection

    def nms(self, input_boxes, iou_threshold, topk, nms_type):
        """Run the NMS variant selected by ``nms_type`` (0 hard, 1 blend, 2 offset).

        Raises:
            NotImplementedError: for any other ``nms_type``.
        """
        if nms_type == 1:
            output_boxes = blending_nms(input_boxes, iou_threshold, topk)
        elif nms_type == 2:
            output_boxes = offset_nms(input_boxes, iou_threshold, topk)
        elif nms_type == 0:
            output_boxes = hard_nms(input_boxes, iou_threshold, topk)
        else:
            raise NotImplementedError
        return output_boxes

    def _anchors_for(self, target_height, target_width):
        """Return the (cached) anchors for the given input size."""
        key = (target_height, target_width)
        anchors = self._anchor_cache.get(key)
        if anchors is None:
            anchors = self.generate_anchors(target_height, target_width)
            self._anchor_cache[key] = anchors
        return anchors

    def generate_anchors(self, target_height, target_width):
        """Build the anchor list for an input of the given size.

        Each anchor is ``[cx, cy, s_kx, s_ky]`` with centre and size divided
        by the input width/height. Anchors of size 32 are placed on a 4x4
        sub-grid of each cell, size 64 on a 2x2 sub-grid, and every other
        size once at the cell centre. The iteration order (feature map, row,
        column, size, y-offset, x-offset) defines the anchor index and has to
        line up with the order of the model's predictions.
        """
        dense_offsets = {
            32: (0.0, 0.25, 0.5, 0.75),
            64: (0.0, 0.5),
        }
        centre_only = (0.5,)

        anchors = []
        for step, level_min_sizes in zip(self.steps, self.min_sizes):
            f_h = int(np.ceil(target_height / step))
            f_w = int(np.ceil(target_width / step))

            for i in range(f_h):
                for j in range(f_w):
                    for min_size in level_min_sizes:
                        s_kx = min_size / target_width
                        s_ky = min_size / target_height
                        offsets = dense_offsets.get(min_size, centre_only)

                        for offset_y in offsets:
                            cy = (i + offset_y) * step / target_height
                            for offset_x in offsets:
                                cx = (j + offset_x) * step / target_width
                                anchors.append([cx, cy, s_kx, s_ky])

        return anchors


# Usage example: python facedetector.py <image_path>
# Prints the detected boxes and writes an annotated copy of the image into
# the current directory under the input's base name.
if __name__ == "__main__":
    import os
    import sys

    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <image_path>")
    img_path = sys.argv[1]

    # cv2.imread returns None instead of raising when the file is unreadable.
    img = cv2.imread(img_path)
    if img is None:
        sys.exit(f"failed to read image: {img_path}")

    reta = FaceBoxesV2(r"./checkpoints/faceboxesv2-640x640.onnx", 4)
    # Guard against a None result so the loops below always get a list.
    detected_boxes = reta.detect(img) or []

    for box in detected_boxes:
        print(f"({box.x1:.3f},{box.y1:.3f},{box.x2:.3f},{box.y2:.3f})", end=" ")
    print("total face number:", len(detected_boxes))

    for box in detected_boxes:
        draw_rectface(img, box)

    # NOTE: if the input image already lives in the current directory this
    # overwrites it with the annotated version.
    filename = os.path.basename(img_path)
    if not cv2.imwrite("./" + filename, img):
        sys.exit(f"failed to write image: ./{filename}")