# Python 2D & 3D 姿态估计系统 - HRNet(2D)和 SemGCN(3D)模型实现
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import csv
import os
import shutil
import pickle
from PIL import Image
import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision
import cv2
import numpy as np
import sys
sys.path.append('HRNet')
sys.path.append('SemGCN')
import time
# import _init_paths
import models
from config import cfg
from config import update_config
from core.inference import get_final_preds
from utils.transforms import get_affine_transform
from common.graph_utils import adj_mx_from_skeleton
from SemGCN.models.sem_gcn import SemGCN
from common.skeleton import Skeleton
from HRNetFace.stabilizer import Stabilizer
# Run on the GPU when one is available, otherwise fall back to the CPU.
CTX = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class Pose3D:
    """2D and 3D human pose estimation pipeline.

    People are first localized with a torchvision Faster R-CNN detector,
    2D keypoints are regressed with HRNet, and the 2D pose is lifted to a
    3D pose with SemGCN.  The 3D output is smoothed with one scalar Kalman
    stabilizer per output value.
    """

    def __init__(self):
        # COCO keypoint index -> joint name (17-keypoint layout).
        self.COCO_KEYPOINT_INDEXES = {
            0: 'nose',
            1: 'left_eye',
            2: 'right_eye',
            3: 'left_ear',
            4: 'right_ear',
            5: 'left_shoulder',
            6: 'right_shoulder',
            7: 'left_elbow',
            8: 'right_elbow',
            9: 'left_wrist',
            10: 'right_wrist',
            11: 'left_hip',
            12: 'right_hip',
            13: 'left_knee',
            14: 'right_knee',
            15: 'left_ankle',
            16: 'right_ankle'
        }
        # Label names of the torchvision detection model; 'N/A' fills the
        # category ids COCO does not use, keeping indices aligned.
        self.COCO_INSTANCE_CATEGORY_NAMES = [
            '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
            'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
            'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
            'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
            'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
            'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
            'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
            'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
            'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
            'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
            'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
            'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
        ]
        # ------------------------- 2D settings -------------------------
        # HRNet expects ImageNet-normalized CHW float tensors.
        self.pose_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        # Load the HRNet inference config first so the cuDNN flags below
        # reflect the YAML values instead of the built-in defaults (the
        # original applied the flags before loading the config).
        update_config(cfg, './HRNet/inference-config.yaml',
                      './HRNet/pytorch/pose_coco/pose_hrnet_w32_384x288.pth')
        cudnn.benchmark = cfg.CUDNN.BENCHMARK
        torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
        torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
        self.box_model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        self.box_model.to(CTX)
        self.box_model.eval()
        # Resolve the pose-net factory by attribute lookup instead of
        # eval(): same result, no arbitrary code execution from config text.
        module = models
        for part in cfg.MODEL.NAME.split('.'):
            module = getattr(module, part)
        self.pose_model = module.get_pose_net(cfg, is_train=False)
        # map_location keeps the checkpoint loadable on CPU-only machines.
        self.pose_model.load_state_dict(
            torch.load(cfg.TEST.MODEL_FILE, map_location=CTX), strict=False)
        self.pose_model.to(CTX)
        self.pose_model.eval()
        # ------------------------- 3D settings -------------------------
        # Reuse CTX so the 3D branch also falls back to the CPU when CUDA
        # is unavailable (the original hard-coded 'cuda' and crashed).
        self.device = CTX
        with open('./SemGCN/common/skeleton.pkl', 'rb') as file:
            skel = pickle.load(file)
        self.adj = adj_mx_from_skeleton(skel)
        self.model_pos = SemGCN(self.adj, 128, num_layers=4, p_dropout=None,
                                nodes_group=None).to(self.device)
        ckpt = torch.load('./SemGCN/models/ckpt_semgcn.pth.tar',
                          map_location=self.device)
        self.model_pos.load_state_dict(ckpt['state_dict'])
        # Inference only: disable autograd globally.
        torch.set_grad_enabled(False)
        self.model_pos.eval()
        # One scalar Kalman stabilizer per 3D output value
        # (16 joints * 3 coordinates = 48).
        self.pose_stabilizers = [Stabilizer(
            state_num=2,
            measure_num=1,
            cov_process=0.1,
            cov_measure=0.1) for _ in range(48)]

    def get_person_detection_boxes(self, img, threshold=0.5):
        """Detect people in an image with Faster R-CNN.

        Parameters
        ----------
        img : numpy.ndarray
            HWC uint8 image (RGB assumed by the detector — the caller
            passes RGB when ``cfg.DATASET.COLOR_RGB`` is set).
        threshold : float
            Minimum detection score to keep a box.

        Returns
        -------
        list
            ``[[(x1, y1), (x2, y2)], ...]`` for every detection labeled
            'person' whose score exceeds ``threshold``.
        """
        pil_image = Image.fromarray(img)
        transform = transforms.Compose([transforms.ToTensor()])
        transformed_img = transform(pil_image)
        pred = self.box_model([transformed_img.to(CTX)])
        pred_classes = [self.COCO_INSTANCE_CATEGORY_NAMES[i]
                        for i in list(pred[0]['labels'].cpu().numpy())]
        pred_boxes = [[(i[0], i[1]), (i[2], i[3])]
                      for i in list(pred[0]['boxes'].cpu().detach().numpy())]
        pred_scores = list(pred[0]['scores'].cpu().detach().numpy())
        person_boxes = []
        # Keep only confident 'person' detections.
        for pred_class, pred_box, pred_score in zip(pred_classes, pred_boxes, pred_scores):
            if (pred_score > threshold) and (pred_class == 'person'):
                person_boxes.append(pred_box)
        return person_boxes

    def get_pose_estimation_prediction(self, image, centers, scales):
        """Run HRNet on one affine-cropped region per (center, scale).

        Returns the decoded keypoint coordinates in original-image space,
        shape ``(num_people, num_joints, 2)``.
        """
        rotation = 0
        model_inputs = []
        for center, scale in zip(centers, scales):
            trans = get_affine_transform(center, scale, rotation, cfg.MODEL.IMAGE_SIZE)
            # Warp the person region to the fixed model input size.
            model_input = cv2.warpAffine(
                image,
                trans,
                (int(cfg.MODEL.IMAGE_SIZE[0]), int(cfg.MODEL.IMAGE_SIZE[1])),
                flags=cv2.INTER_LINEAR)
            # HWC uint8 -> normalized CHW tensor.
            model_input = self.pose_transform(model_input)
            model_inputs.append(model_input)
        # n * CHW -> NCHW batch.
        model_inputs = torch.stack(model_inputs)
        # Heatmap output, decoded back to image coordinates.
        output = self.pose_model(model_inputs.to(CTX))
        coords, _ = get_final_preds(
            cfg,
            output.cpu().detach().numpy(),
            np.asarray(centers),
            np.asarray(scales))
        return coords

    def box_to_center_scale(self, box, model_image_width, model_image_height):
        """Convert a box to the center/scale pair used by the pose transform.

        Parameters
        ----------
        box : list of tuple
            Length-2 list of (x, y) float tuples: two opposite corners of
            the box.
        model_image_width : int
        model_image_height : int

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            Box center coordinates and box scale (in units of 200 px,
            the standard COCO ``pixel_std``), padded by 25%.
        """
        center = np.zeros((2), dtype=np.float32)
        bottom_left_corner = box[0]
        top_right_corner = box[1]
        box_width = top_right_corner[0] - bottom_left_corner[0]
        box_height = top_right_corner[1] - bottom_left_corner[1]
        bottom_left_x = bottom_left_corner[0]
        bottom_left_y = bottom_left_corner[1]
        center[0] = bottom_left_x + box_width * 0.5
        center[1] = bottom_left_y + box_height * 0.5
        aspect_ratio = model_image_width * 1.0 / model_image_height
        pixel_std = 200
        # Grow the shorter side so the box matches the model aspect ratio.
        if box_width > aspect_ratio * box_height:
            box_height = box_width * 1.0 / aspect_ratio
        elif box_width < aspect_ratio * box_height:
            box_width = box_height * aspect_ratio
        scale = np.array(
            [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std],
            dtype=np.float32)
        # Pad the crop by 25% around the person.
        if center[0] != -1:
            scale = scale * 1.25
        return center, scale

    def normalize_screen_coordinates(self, X, w, h):
        """Map pixel coordinates so x spans [-1, 1]; y is scaled by the
        same factor, preserving the image aspect ratio."""
        assert X.shape[-1] == 2
        return X / w * 2 - [1, h / w]

    def convert2d(self, input_2d):
        """Convert 17-point COCO keypoints to the 16-joint H36M layout.

        The hip, spine and neck joints (absent from COCO) are synthesized:
        hip is the midpoint of the two hips, neck the midpoint of the two
        shoulders (shifted up by 1/6 of the shoulder span), and spine the
        midpoint of hip and neck.
        """
        # H36M joint i <- COCO index corr_idx[i]; negative values mark the
        # synthesized joints (-1 hip, -2 spine, -3 neck).
        corr_idx = [-1, 12, 14, 16, 11, 13, 15, -2, -3, 0, 5, 7, 9, 6, 8, 10]
        kps = []
        hip_pos = (input_2d[11] + input_2d[12]) / 2.0
        neck_pos = (input_2d[5] + input_2d[6]) / 2.0
        neck_pos[1] = neck_pos[1] - abs(input_2d[5][0] - input_2d[6][0]) / 6
        spine_pos = (hip_pos + neck_pos) / 2.0
        for i in range(len(corr_idx)):
            if(corr_idx[i] == -1):
                kps.append(hip_pos)
            elif(corr_idx[i] == -2):
                kps.append(spine_pos)
            elif(corr_idx[i] == -3):
                kps.append(neck_pos)
            else:
                kps.append(input_2d[corr_idx[i]])
        kps = np.array(kps)
        return kps

    def infer2d(self, image_bgr):
        """Detect the first person in a BGR image and return its 2D pose
        in the H36M-16 layout (see ``convert2d``)."""
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        # Separate copies for detection and pose estimation.
        if cfg.DATASET.COLOR_RGB:
            image_per = image_rgb.copy()
            image_pose = image_rgb.copy()
        else:
            image_per = image_bgr.copy()
            image_pose = image_bgr.copy()
        pred_boxes = self.get_person_detection_boxes(image_per, threshold=0.9)
        centers = []
        scales = []
        # Fall back to the full frame when no person is detected.
        if(len(pred_boxes) == 0):
            pred_boxes = [[(0, 0), (image_bgr.shape[1], image_bgr.shape[0])]]
        # Only the first detected person is processed.
        box = pred_boxes[0]
        center, scale = self.box_to_center_scale(box, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[1])
        centers.append(center)
        scales.append(scale)
        pose_preds = self.get_pose_estimation_prediction(image_pose, centers, scales)
        coco2d = self.convert2d(np.squeeze(pose_preds))
        return coco2d

    def infer3d(self, img, inputs_2d):
        """Lift a 2D pose to 3D with SemGCN and Kalman-smooth the result.

        Returns a ``(16, 3)`` array of stabilized 3D joint positions.
        """
        inputs_2d = self.normalize_screen_coordinates(inputs_2d, img.shape[1], img.shape[0])
        inputs_2d = torch.from_numpy(inputs_2d)
        inputs_2d = inputs_2d.to(self.device)
        outputs_3d = np.squeeze(self.model_pos(inputs_2d.float()).cpu().detach().numpy())
        # Smooth each scalar coordinate with its own Kalman stabilizer to
        # reduce frame-to-frame jitter.
        steady_pose = []
        pose_np = np.array(outputs_3d).flatten()
        for value, ps_stb in zip(pose_np, self.pose_stabilizers):
            ps_stb.update([value])
            steady_pose.append(ps_stb.state[0])
        outputs_3d = np.reshape(steady_pose, (-1, 3))
        return outputs_3d
def main():
    """Demo entry point: run the full 2D -> 3D pipeline on a test image
    and visualize both the 2D skeleton and the lifted 3D pose."""
    img = cv2.imread('./data/test_img7.jpg')
    pose3d_model = Pose3D()
    # 2D prediction (H36M-16 layout, see Pose3D.convert2d).
    kps2d = pose3d_model.infer2d(img)
    # 3D prediction.
    outputs_3d = pose3d_model.infer3d(img, kps2d)
    # Re-axis the 3D output for plotting: (x, y, z) -> (z, -x, -y).
    kps3d = np.zeros_like(outputs_3d)
    kps3d[..., 0] = outputs_3d[..., 2]
    kps3d[..., 1] = -outputs_3d[..., 0]
    kps3d[..., 2] = -outputs_3d[..., 1]
    if(True):
        import matplotlib.pyplot as plt
        # Parent joint of each H36M joint; -1 marks the root (hip).
        # H36M layout:
        # https://user-images.githubusercontent.com/1342385/64393700-a65aa500-d020-11e9-9627-17e17a6f5790.png
        # {0-'hip', 1-'RHip', 2-'RKnee', 3-'RFoot', 4-'LHip', 5-'LKnee', 6-'LFoot',
        #  7-'Spine', 8-'Neck', 9-'Head', 10-'LShoulder', 11-'LElbow', 12-'LWrist',
        #  13-'RShoulder', 14-'RElbow', 15-'RWrist'}
        parent = [-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 8, 10, 11, 8, 13, 14]
        # ------------------------- draw 2D keypoints -------------------------
        img_show = img.copy()
        for i in range(kps2d.shape[0]):
            cv2.circle(img_show, (int(kps2d[i][0]), int(kps2d[i][1])), 8, (0, 255, 0), -1)
        for i in range(len(parent)):
            start = parent[i]
            if(start != -1):
                cv2.line(img_show, (int(kps2d[start][0]), int(kps2d[start][1])),
                         (int(kps2d[i][0]), int(kps2d[i][1])), (0, 255, 255), 8)
        # Highlight the root joint in red.
        idx = 0
        cv2.circle(img_show, (int(kps2d[idx][0]), int(kps2d[idx][1])), 8, (0, 0, 255), -1)
        plt.figure(figsize=(8, 8))
        # BGR -> RGB and scale to [0, 1] for imshow.
        plt.imshow(img_show[..., ::-1] / 255.0)
        plt.axis('off')
        # ------------------------- draw 3D keypoints -------------------------
        fig = plt.figure()
        # Figure.gca(projection=...) was deprecated in matplotlib 3.4 and
        # removed in 3.7; add_subplot is the supported way to get 3D axes.
        ax = fig.add_subplot(projection='3d')
        ax.scatter3D(kps3d[:, 0], kps3d[:, 1], kps3d[:, 2], 'green')
        for i in range(len(parent)):
            if(parent[i] != -1):
                ax.plot3D(kps3d[[i, parent[i]], 0], kps3d[[i, parent[i]], 1],
                          kps3d[[i, parent[i]], 2], 'gray')
        ax.xaxis.set_tick_params(labelsize=10, colors='red')
        ax.yaxis.set_tick_params(labelsize=10, colors='green')
        ax.zaxis.set_tick_params(labelsize=10, colors='blue')
        ax.view_init(elev=10., azim=180)
        plt.show()


if __name__ == '__main__':
    main()
请解析上面的代码内容:该代码实现了一个姿态估计的系统,包括2D姿态估计和3D姿态估计。其中,2D姿态估计使用了HRNet模型,3D姿态估计使用了SemGCN模型。具体实现细节如下:
1.初始化
定义了一个Pose3D类,该类包含了COCO_KEYPOINT_INDEXES、COCO_INSTANCE_CATEGORY_NAMES、pose_transform等变量,以及box_model、pose_model、model_pos等模型,并且将这些模型都加载到了GPU上。
2.获取人物检测框
通过传入一张图片,使用Faster R-CNN模型进行人物检测,并返回所有检测到的人的边界框。
3.2D姿态估计
使用HRNet模型对每个人的边界框进行2D姿态估计,并返回所有关键点的2D坐标。
4.3D姿态估计
使用SemGCN模型将2D姿态转换为3D姿态,并返回所有关键点的3D坐标。
5.可视化
使用matplotlib库将2D姿态和3D姿态可视化。
代码中还包含了一些辅助函数,例如box_to_center_scale()用于将边界框转换为姿态估计模型所需的中心点和缩放比例,normalize_screen_coordinates()用于将屏幕横坐标归一化到[-1, 1]之间(纵坐标按相同比例缩放以保持宽高比),convert2d()用于将COCO格式的2D姿态转换为H36M格式的2D姿态。
总的来说,该代码是一个完整的姿态估计系统,可以用于对图片或视频进行人体姿态估计,并提供2D和3D姿态的可视化结果。
代码中的部分细节解释:
CTX:用于指定模型运行的设备,如果GPU可用,则使用GPU,否则使用CPU。pose_transform:用于将图像转换为姿态估计模型所需的格式,包括归一化和转换为张量。box_model:Faster R-CNN模型,用于人物检测。pose_model:HRNet模型,用于2D姿态估计。model_pos:SemGCN模型,用于3D姿态估计。infer2d():用于进行2D姿态估计的函数。infer3d():用于进行3D姿态估计的函数。main():主函数,用于测试代码,并可视化结果。
代码中的部分注释:
# import _init_paths:表示该代码使用了其他库,例如HRNet和SemGCN库,需要将这些库的路径添加到系统环境变量中。# test video:表示代码中包含了使用视频进行姿态估计的代码,但被注释掉了。# draw kps:表示代码中包含了对2D和3D姿态进行可视化的代码。
代码中的部分可改进的地方:
- 代码中的注释可以更加详细,方便理解代码的功能和逻辑。
- 代码中可以添加更多错误处理,例如捕获异常等。
- 代码可以进行优化,例如使用多线程或多进程来提高效率。
总的来说,该代码是一个功能完整的姿态估计系统,可以作为学习和参考的例子。
原文地址: https://www.cveoy.top/t/topic/fZzr 著作权归作者所有。请勿转载和采集!