# Python 2D & 3D 姿态估计系统 - HRNet(2D)和 SemGCN(3D)模型实现
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import csv
import os
import shutil
import pickle
from PIL import Image
import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision
import cv2
import numpy as np
import sys
sys.path.append('HRNet')
sys.path.append('SemGCN')
import time
# import _init_paths
import models
from config import cfg
from config import update_config
from core.inference import get_final_preds
from utils.transforms import get_affine_transform
from common.graph_utils import adj_mx_from_skeleton
from SemGCN.models.sem_gcn import SemGCN
from common.skeleton import Skeleton
from HRNetFace.stabilizer import Stabilizer
# Run on the GPU when one is available, otherwise fall back to the CPU.
CTX = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class Pose3D:
    """2D and 3D human pose estimation pipeline.

    People are first localized with a torchvision Faster R-CNN detector,
    2D keypoints are regressed with HRNet, and the 2D pose is lifted to a
    3D pose with SemGCN.  The 3D output is smoothed with one scalar Kalman
    stabilizer per output value.
    """

    def __init__(self):
        # COCO keypoint index -> joint name (17-keypoint layout).
        self.COCO_KEYPOINT_INDEXES = {
            0: 'nose',
            1: 'left_eye',
            2: 'right_eye',
            3: 'left_ear',
            4: 'right_ear',
            5: 'left_shoulder',
            6: 'right_shoulder',
            7: 'left_elbow',
            8: 'right_elbow',
            9: 'left_wrist',
            10: 'right_wrist',
            11: 'left_hip',
            12: 'right_hip',
            13: 'left_knee',
            14: 'right_knee',
            15: 'left_ankle',
            16: 'right_ankle'
        }
        # Label names of the torchvision detection model; 'N/A' fills the
        # category ids COCO does not use, keeping indices aligned.
        self.COCO_INSTANCE_CATEGORY_NAMES = [
            '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
            'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
            'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
            'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
            'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
            'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
            'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
            'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
            'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
            'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
            'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
            'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
        ]
        # ------------------------- 2D settings -------------------------
        # HRNet expects ImageNet-normalized CHW float tensors.
        self.pose_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        # Load the HRNet inference config first so the cuDNN flags below
        # reflect the YAML values instead of the built-in defaults (the
        # original applied the flags before loading the config).
        update_config(cfg, './HRNet/inference-config.yaml',
                      './HRNet/pytorch/pose_coco/pose_hrnet_w32_384x288.pth')
        cudnn.benchmark = cfg.CUDNN.BENCHMARK
        torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
        torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
        self.box_model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        self.box_model.to(CTX)
        self.box_model.eval()
        # Resolve the pose-net factory by attribute lookup instead of
        # eval(): same result, no arbitrary code execution from config text.
        module = models
        for part in cfg.MODEL.NAME.split('.'):
            module = getattr(module, part)
        self.pose_model = module.get_pose_net(cfg, is_train=False)
        # map_location keeps the checkpoint loadable on CPU-only machines.
        self.pose_model.load_state_dict(
            torch.load(cfg.TEST.MODEL_FILE, map_location=CTX), strict=False)
        self.pose_model.to(CTX)
        self.pose_model.eval()
        # ------------------------- 3D settings -------------------------
        # Reuse CTX so the 3D branch also falls back to the CPU when CUDA
        # is unavailable (the original hard-coded 'cuda' and crashed).
        self.device = CTX
        with open('./SemGCN/common/skeleton.pkl', 'rb') as file:
            skel = pickle.load(file)
        self.adj = adj_mx_from_skeleton(skel)
        self.model_pos = SemGCN(self.adj, 128, num_layers=4, p_dropout=None,
                                nodes_group=None).to(self.device)
        ckpt = torch.load('./SemGCN/models/ckpt_semgcn.pth.tar',
                          map_location=self.device)
        self.model_pos.load_state_dict(ckpt['state_dict'])
        # Inference only: disable autograd globally.
        torch.set_grad_enabled(False)
        self.model_pos.eval()
        # One scalar Kalman stabilizer per 3D output value
        # (16 joints * 3 coordinates = 48).
        self.pose_stabilizers = [Stabilizer(
            state_num=2,
            measure_num=1,
            cov_process=0.1,
            cov_measure=0.1) for _ in range(48)]

    def get_person_detection_boxes(self, img, threshold=0.5):
        """Detect people in an image with Faster R-CNN.

        Parameters
        ----------
        img : numpy.ndarray
            HWC uint8 image (RGB assumed by the detector — the caller
            passes RGB when ``cfg.DATASET.COLOR_RGB`` is set).
        threshold : float
            Minimum detection score to keep a box.

        Returns
        -------
        list
            ``[[(x1, y1), (x2, y2)], ...]`` for every detection labeled
            'person' whose score exceeds ``threshold``.
        """
        pil_image = Image.fromarray(img)
        transform = transforms.Compose([transforms.ToTensor()])
        transformed_img = transform(pil_image)
        pred = self.box_model([transformed_img.to(CTX)])
        pred_classes = [self.COCO_INSTANCE_CATEGORY_NAMES[i]
                        for i in list(pred[0]['labels'].cpu().numpy())]
        pred_boxes = [[(i[0], i[1]), (i[2], i[3])]
                      for i in list(pred[0]['boxes'].cpu().detach().numpy())]
        pred_scores = list(pred[0]['scores'].cpu().detach().numpy())
        person_boxes = []
        # Keep only confident 'person' detections.
        for pred_class, pred_box, pred_score in zip(pred_classes, pred_boxes, pred_scores):
            if (pred_score > threshold) and (pred_class == 'person'):
                person_boxes.append(pred_box)
        return person_boxes

    def get_pose_estimation_prediction(self, image, centers, scales):
        """Run HRNet on one affine-cropped region per (center, scale).

        Returns the decoded keypoint coordinates in original-image space,
        shape ``(num_people, num_joints, 2)``.
        """
        rotation = 0
        model_inputs = []
        for center, scale in zip(centers, scales):
            trans = get_affine_transform(center, scale, rotation, cfg.MODEL.IMAGE_SIZE)
            # Warp the person region to the fixed model input size.
            model_input = cv2.warpAffine(
                image,
                trans,
                (int(cfg.MODEL.IMAGE_SIZE[0]), int(cfg.MODEL.IMAGE_SIZE[1])),
                flags=cv2.INTER_LINEAR)
            # HWC uint8 -> normalized CHW tensor.
            model_input = self.pose_transform(model_input)
            model_inputs.append(model_input)
        # n * CHW -> NCHW batch.
        model_inputs = torch.stack(model_inputs)
        # Heatmap output, decoded back to image coordinates.
        output = self.pose_model(model_inputs.to(CTX))
        coords, _ = get_final_preds(
            cfg,
            output.cpu().detach().numpy(),
            np.asarray(centers),
            np.asarray(scales))
        return coords

    def box_to_center_scale(self, box, model_image_width, model_image_height):
        """Convert a box to the center/scale pair used by the pose transform.

        Parameters
        ----------
        box : list of tuple
            Length-2 list of (x, y) float tuples: two opposite corners of
            the box.
        model_image_width : int
        model_image_height : int

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            Box center coordinates and box scale (in units of 200 px,
            the standard COCO ``pixel_std``), padded by 25%.
        """
        center = np.zeros((2), dtype=np.float32)
        bottom_left_corner = box[0]
        top_right_corner = box[1]
        box_width = top_right_corner[0] - bottom_left_corner[0]
        box_height = top_right_corner[1] - bottom_left_corner[1]
        bottom_left_x = bottom_left_corner[0]
        bottom_left_y = bottom_left_corner[1]
        center[0] = bottom_left_x + box_width * 0.5
        center[1] = bottom_left_y + box_height * 0.5
        aspect_ratio = model_image_width * 1.0 / model_image_height
        pixel_std = 200
        # Grow the shorter side so the box matches the model aspect ratio.
        if box_width > aspect_ratio * box_height:
            box_height = box_width * 1.0 / aspect_ratio
        elif box_width < aspect_ratio * box_height:
            box_width = box_height * aspect_ratio
        scale = np.array(
            [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std],
            dtype=np.float32)
        # Pad the crop by 25% around the person.
        if center[0] != -1:
            scale = scale * 1.25
        return center, scale

    def normalize_screen_coordinates(self, X, w, h):
        """Map pixel coordinates so x spans [-1, 1]; y is scaled by the
        same factor, preserving the image aspect ratio."""
        assert X.shape[-1] == 2
        return X / w * 2 - [1, h / w]

    def convert2d(self, input_2d):
        """Convert 17-point COCO keypoints to the 16-joint H36M layout.

        The hip, spine and neck joints (absent from COCO) are synthesized:
        hip is the midpoint of the two hips, neck the midpoint of the two
        shoulders (shifted up by 1/6 of the shoulder span), and spine the
        midpoint of hip and neck.
        """
        # H36M joint i <- COCO index corr_idx[i]; negative values mark the
        # synthesized joints (-1 hip, -2 spine, -3 neck).
        corr_idx = [-1, 12, 14, 16, 11, 13, 15, -2, -3, 0, 5, 7, 9, 6, 8, 10]
        kps = []
        hip_pos = (input_2d[11] + input_2d[12]) / 2.0
        neck_pos = (input_2d[5] + input_2d[6]) / 2.0
        neck_pos[1] = neck_pos[1] - abs(input_2d[5][0] - input_2d[6][0]) / 6
        spine_pos = (hip_pos + neck_pos) / 2.0
        for i in range(len(corr_idx)):
            if(corr_idx[i] == -1):
                kps.append(hip_pos)
            elif(corr_idx[i] == -2):
                kps.append(spine_pos)
            elif(corr_idx[i] == -3):
                kps.append(neck_pos)
            else:
                kps.append(input_2d[corr_idx[i]])
        kps = np.array(kps)
        return kps

    def infer2d(self, image_bgr):
        """Detect the first person in a BGR image and return its 2D pose
        in the H36M-16 layout (see ``convert2d``)."""
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        # Separate copies for detection and pose estimation.
        if cfg.DATASET.COLOR_RGB:
            image_per = image_rgb.copy()
            image_pose = image_rgb.copy()
        else:
            image_per = image_bgr.copy()
            image_pose = image_bgr.copy()
        pred_boxes = self.get_person_detection_boxes(image_per, threshold=0.9)
        centers = []
        scales = []
        # Fall back to the full frame when no person is detected.
        if(len(pred_boxes) == 0):
            pred_boxes = [[(0, 0), (image_bgr.shape[1], image_bgr.shape[0])]]
        # Only the first detected person is processed.
        box = pred_boxes[0]
        center, scale = self.box_to_center_scale(box, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[1])
        centers.append(center)
        scales.append(scale)
        pose_preds = self.get_pose_estimation_prediction(image_pose, centers, scales)
        coco2d = self.convert2d(np.squeeze(pose_preds))
        return coco2d

    def infer3d(self, img, inputs_2d):
        """Lift a 2D pose to 3D with SemGCN and Kalman-smooth the result.

        Returns a ``(16, 3)`` array of stabilized 3D joint positions.
        """
        inputs_2d = self.normalize_screen_coordinates(inputs_2d, img.shape[1], img.shape[0])
        inputs_2d = torch.from_numpy(inputs_2d)
        inputs_2d = inputs_2d.to(self.device)
        outputs_3d = np.squeeze(self.model_pos(inputs_2d.float()).cpu().detach().numpy())
        # Smooth each scalar coordinate with its own Kalman stabilizer to
        # reduce frame-to-frame jitter.
        steady_pose = []
        pose_np = np.array(outputs_3d).flatten()
        for value, ps_stb in zip(pose_np, self.pose_stabilizers):
            ps_stb.update([value])
            steady_pose.append(ps_stb.state[0])
        outputs_3d = np.reshape(steady_pose, (-1, 3))
        return outputs_3d
def main():
    """Demo entry point: run the full 2D -> 3D pipeline on a test image
    and visualize both the 2D skeleton and the lifted 3D pose."""
    img = cv2.imread('./data/test_img7.jpg')
    pose3d_model = Pose3D()
    # 2D prediction (H36M-16 layout, see Pose3D.convert2d).
    kps2d = pose3d_model.infer2d(img)
    # 3D prediction.
    outputs_3d = pose3d_model.infer3d(img, kps2d)
    # Re-axis the 3D output for plotting: (x, y, z) -> (z, -x, -y).
    kps3d = np.zeros_like(outputs_3d)
    kps3d[..., 0] = outputs_3d[..., 2]
    kps3d[..., 1] = -outputs_3d[..., 0]
    kps3d[..., 2] = -outputs_3d[..., 1]
    if(True):
        import matplotlib.pyplot as plt
        # Parent joint of each H36M joint; -1 marks the root (hip).
        # H36M layout:
        # https://user-images.githubusercontent.com/1342385/64393700-a65aa500-d020-11e9-9627-17e17a6f5790.png
        # {0-'hip', 1-'RHip', 2-'RKnee', 3-'RFoot', 4-'LHip', 5-'LKnee', 6-'LFoot',
        #  7-'Spine', 8-'Neck', 9-'Head', 10-'LShoulder', 11-'LElbow', 12-'LWrist',
        #  13-'RShoulder', 14-'RElbow', 15-'RWrist'}
        parent = [-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 8, 10, 11, 8, 13, 14]
        # ------------------------- draw 2D keypoints -------------------------
        img_show = img.copy()
        for i in range(kps2d.shape[0]):
            cv2.circle(img_show, (int(kps2d[i][0]), int(kps2d[i][1])), 8, (0, 255, 0), -1)
        for i in range(len(parent)):
            start = parent[i]
            if(start != -1):
                cv2.line(img_show, (int(kps2d[start][0]), int(kps2d[start][1])),
                         (int(kps2d[i][0]), int(kps2d[i][1])), (0, 255, 255), 8)
        # Highlight the root joint in red.
        idx = 0
        cv2.circle(img_show, (int(kps2d[idx][0]), int(kps2d[idx][1])), 8, (0, 0, 255), -1)
        plt.figure(figsize=(8, 8))
        # BGR -> RGB and scale to [0, 1] for imshow.
        plt.imshow(img_show[..., ::-1] / 255.0)
        plt.axis('off')
        # ------------------------- draw 3D keypoints -------------------------
        fig = plt.figure()
        # Figure.gca(projection=...) was deprecated in matplotlib 3.4 and
        # removed in 3.7; add_subplot is the supported way to get 3D axes.
        ax = fig.add_subplot(projection='3d')
        ax.scatter3D(kps3d[:, 0], kps3d[:, 1], kps3d[:, 2], 'green')
        for i in range(len(parent)):
            if(parent[i] != -1):
                ax.plot3D(kps3d[[i, parent[i]], 0], kps3d[[i, parent[i]], 1],
                          kps3d[[i, parent[i]], 2], 'gray')
        ax.xaxis.set_tick_params(labelsize=10, colors='red')
        ax.yaxis.set_tick_params(labelsize=10, colors='green')
        ax.zaxis.set_tick_params(labelsize=10, colors='blue')
        ax.view_init(elev=10., azim=180)
        plt.show()


if __name__ == '__main__':
    main()
请解析上面的代码内容:该代码实现了一个姿态估计的系统,包括2D姿态估计和3D姿态估计。其中,2D姿态估计使用了HRNet模型,3D姿态估计使用了SemGCN模型。具体实现细节如下:
1.初始化
定义了一个Pose3D类,该类包含了COCO_KEYPOINT_INDEXES、COCO_INSTANCE_CATEGORY_NAMES、pose_transform等变量,以及box_model、pose_model、model_pos等模型,并且将这些模型都加载到了GPU上。
2.获取人物检测框
通过传入一张图片,使用Faster R-CNN模型进行人物检测,并返回所有检测到的人的边界框。
3.2D姿态估计
使用HRNet模型对每个人的边界框进行2D姿态估计,并返回所有关键点的2D坐标。
4.3D姿态估计
使用SemGCN模型将2D姿态转换为3D姿态,并返回所有关键点的3D坐标。
5.可视化
使用matplotlib库将2D姿态和3D姿态可视化。
代码中还包含了一些辅助函数,例如box_to_center_scale()用于将边界框转换为姿态估计模型所需的中心点和缩放比例,normalize_screen_coordinates()用于将屏幕横坐标归一化到[-1, 1]之间(纵坐标按相同比例缩放以保持宽高比),convert2d()用于将COCO格式的2D姿态转换为H36M格式的2D姿态。
总的来说,该代码是一个完整的姿态估计系统,可以用于对图片或视频进行人体姿态估计,并提供2D和3D姿态的可视化结果。
代码中的部分细节解释:
CTX:用于指定模型运行的设备,如果GPU可用,则使用GPU,否则使用CPU。pose_transform:用于将图像转换为姿态估计模型所需的格式,包括归一化和转换为张量。box_model:Faster R-CNN模型,用于人物检测。pose_model:HRNet模型,用于2D姿态估计。model_pos:SemGCN模型,用于3D姿态估计。infer2d():用于进行2D姿态估计的函数。infer3d():用于进行3D姿态估计的函数。main():主函数,用于测试代码,并可视化结果。
代码中的部分注释:
# import _init_paths:表示该代码使用了其他库,例如HRNet和SemGCN库,需要将这些库的路径添加到系统环境变量中。# test video:表示代码中包含了使用视频进行姿态估计的代码,但被注释掉了。# draw kps:表示代码中包含了对2D和3D姿态进行可视化的代码。
代码中的部分可改进的地方:
- 代码中的注释可以更加详细,方便理解代码的功能和逻辑。
- 代码中可以添加更多错误处理,例如捕获异常等。
- 代码可以进行优化,例如使用多线程或多进程来提高效率。
总的来说,该代码是一个功能完整的姿态估计系统,可以作为学习和参考的例子。
原文地址: https://www.cveoy.top/t/topic/fZzr 著作权归作者所有。请勿转载和采集!