import torch import math import cv2 import numpy as np from PIL import Image, ImageDraw from torch.autograd import Variable from .get_nets import PNet, RNet, ONet from .utils import ( try_gpu, nms, calibrate_box, convert_to_square, correct_bboxes, get_image_boxes, generate_bboxes, preprocess, ) class FaceDetector: def __init__(self, device=None, dir_path=None): if device is None: device=try_gpu() self.device = device if dir_path is not None: self.pnet = PNet(dir_path).to(device) self.rnet = RNet(dir_path).to(device) self.onet = ONet(dir_path).to(device) self.onet.eval() else: # LOAD MODELS self.pnet = PNet().to(device) self.rnet = RNet().to(device) self.onet = ONet().to(device) self.onet.eval() def detect( self, image, min_face_size=20.0, thresholds=[0.6, 0.7, 0.8], nms_thresholds=[0.7, 0.7, 0.7], ): """ Arguments: image: an instance of PIL.Image. min_face_size: a float number. thresholds: a list of length 3. nms_thresholds: a list of length 3. Returns: two float numpy arrays of shapes [n_boxes, 5] and [n_boxes, 10], bounding boxes and facial landmarks. """ # this detector only support RGB Image input !!!!!!!!! # todo: fix eas if type(image) == np.ndarray: image = Image.fromarray(image) #image = Image.fromarray(cv2.cvtColor(image,cv2.COLOR_BGR2RGB)) # detector need Image Input # BUILD AN IMAGE PYRAMID width, height = image.size min_length = min(height, width) min_detection_size = 12 factor = 0.707 # sqrt(0.5) # scales for scaling the image scales = [] # scales the image so that # minimum size that we can detect equals to # minimum face size that we want to detect m = min_detection_size / min_face_size min_length *= m factor_count = 0 while min_length > min_detection_size: scales.append(m * factor ** factor_count) min_length *= factor factor_count += 1 # STAGE 1 # it will be returned bounding_boxes = [] # run P-Net on different scales for s in scales: boxes = self.__run_first_stage(image, scale=s, threshold=thresholds[0]) bounding_boxes.append(boxes) # collect boxes (and offsets, and scores) from different scales bounding_boxes = [i for i in bounding_boxes if i is not None] bounding_boxes = np.vstack(bounding_boxes) keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0]) bounding_boxes = bounding_boxes[keep] # use offsets predicted by pnet to transform bounding boxes bounding_boxes = calibrate_box(bounding_boxes[:, 0:5], bounding_boxes[:, 5:]) # shape [n_boxes, 5] bounding_boxes = convert_to_square(bounding_boxes) bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) # STAGE 2 img_boxes = get_image_boxes(bounding_boxes, image, size=24) with torch.no_grad(): img_boxes = Variable(torch.FloatTensor(img_boxes).to(self.device)) output = self.rnet(img_boxes) offsets = output[0].cpu().data.numpy() # shape [n_boxes, 4] probs = output[1].cpu().data.numpy() # shape [n_boxes, 2] keep = np.where(probs[:, 1] > thresholds[1])[0] bounding_boxes = bounding_boxes[keep] bounding_boxes[:, 4] = probs[keep, 1].reshape((-1,)) offsets = offsets[keep] keep = nms(bounding_boxes, nms_thresholds[1]) bounding_boxes = bounding_boxes[keep] bounding_boxes = calibrate_box(bounding_boxes, offsets[keep]) bounding_boxes = convert_to_square(bounding_boxes) bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) # STAGE 3 img_boxes = get_image_boxes(bounding_boxes, image, size=48) if len(img_boxes) == 0: return [], [] with torch.no_grad(): img_boxes = Variable(torch.FloatTensor(img_boxes).to(self.device)) output = self.onet(img_boxes) landmarks = output[0].cpu().data.numpy() # shape [n_boxes, 10] offsets = output[1].cpu().data.numpy() # shape [n_boxes, 4] probs = output[2].cpu().data.numpy() # shape [n_boxes, 2] keep = np.where(probs[:, 1] > thresholds[2])[0] bounding_boxes = bounding_boxes[keep] bounding_boxes[:, 4] = probs[keep, 1].reshape((-1,)) offsets = offsets[keep] landmarks = landmarks[keep] # compute landmark points width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0 height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0 xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1] landmarks[:, 0:5] = ( np.expand_dims(xmin, 1) + np.expand_dims(width, 1) * landmarks[:, 0:5] ) landmarks[:, 5:10] = ( np.expand_dims(ymin, 1) + np.expand_dims(height, 1) * landmarks[:, 5:10] ) bounding_boxes = calibrate_box(bounding_boxes, offsets) keep = nms(bounding_boxes, nms_thresholds[2], mode="min") bounding_boxes = bounding_boxes[keep] landmarks = landmarks[keep] # reshape [x1,x2,..x5,y1,..y5] to [[x1,y1],...[x5,y5]] landmarks = [np.array(ld).reshape((5,2), order="F") for ld in landmarks] return bounding_boxes, landmarks def safe_detect(self, image, min_face_size=20.0, thresholds=[0.6, 0.7, 0.8], nms_thresholds=[0.7, 0.7, 0.7], score_thresholds=0.90): try: bbox, ld = self.detect(image, min_face_size, thresholds, nms_thresholds) _bbox = [] _ld = [] for idx,_ in enumerate(bbox): if bbox[idx][-1] >= score_thresholds: _bbox.append(bbox[idx]) _ld.append(ld[idx]) return _bbox, _ld except: return [], [] def draw_bboxes(self, image): """Draw bounding boxes and facial landmarks. Arguments: image: an instance of PIL.Image. Returns: an instance of PIL.Image. """ bounding_boxes, facial_landmarks = self.detect(image) img_copy = image.copy() draw = ImageDraw.Draw(img_copy) for b in bounding_boxes: draw.rectangle([(b[0], b[1]), (b[2], b[3])], outline="white") for p in facial_landmarks: for i in range(5): draw.ellipse( [(p[i] - 1.0, p[i + 5] - 1.0), (p[i] + 1.0, p[i + 5] + 1.0)], outline="blue", ) return img_copy def crop_faces(self, image, size=112): """Crop all face images. Arguments: image: an instance of PIL.Image. size: the side length of output images. Returns: a list of PIL.Image instances """ bounding_boxes, _ = self.detect(image) img_list = [] # convert bboxes to square square_bboxes = convert_to_square(bounding_boxes) for b in square_bboxes: face_img = image.crop((b[0], b[1], b[2], b[3])) face_img = face_img.resize((size, size), Image.BILINEAR) img_list.append(face_img) return img_list def __run_first_stage(self, image, scale, threshold): """Run P-Net, generate bounding boxes, and do NMS. Arguments: image: an instance of PIL.Image. scale: a float number, scale width and height of the image by this number. threshold: a float number, threshold on the probability of a face when generating bounding boxes from predictions of the net. Returns: a float numpy array of shape [n_boxes, 9], bounding boxes with scores and offsets (4 + 1 + 4). """ # scale the image and convert it to a float array width, height = image.size sw, sh = math.ceil(width * scale), math.ceil(height * scale) img = image.resize((sw, sh), Image.BILINEAR) img = np.asarray(img, "float32") with torch.no_grad(): img = Variable(torch.FloatTensor(preprocess(img)).to(self.device)) output = self.pnet(img) probs = output[1].cpu().data.numpy()[0, 1, :, :] offsets = output[0].cpu().data.numpy() # probs: probability of a face at each sliding window # offsets: transformations to true bounding boxes boxes = generate_bboxes(probs, offsets, scale, threshold) if len(boxes) == 0: return None keep = nms(boxes[:, 0:5], overlap_threshold=0.5) return boxes[keep]