add face-2d-keypoints

Link: https://code.alibaba-inc.com/pai-vision/EasyCV/codereview/9809249

    * add face 2d keypoint config
shouzhou.bx 2022-08-25 16:57:37 +08:00 committed by jiangnana.jnn
parent bc64851614
commit 2e8fc44dc1
30 changed files with 1787 additions and 2 deletions

View File

@ -0,0 +1,236 @@
# model settings
POINT_NUMBER = 106
MEAN_FACE = [
0.05486667535113006, 0.24441904048908245, 0.05469932714062696,
0.30396829196709935, 0.05520653400164321, 0.3643191463607746,
0.05865501342257397, 0.42453849020500306, 0.0661603899137523,
0.48531377442945767, 0.07807677169271177, 0.5452126843738523,
0.09333319368757653, 0.6047840615432064, 0.11331425394034209,
0.6631144309665994, 0.13897813867699352, 0.7172296230155276,
0.17125811033538194, 0.767968859462583, 0.20831698519371536,
0.8146603379935117, 0.24944621000897876, 0.857321261721953,
0.2932993820558674, 0.8973900596678597, 0.33843820185594653,
0.9350576242126986, 0.38647802623495553, 0.966902971122812,
0.4411974776504609, 0.9878629960611088, 0.5000390697219397,
0.9934886214875595, 0.5588590024515473, 0.9878510782414189,
0.6135829360035883, 0.9668655595323074, 0.6616294188166414,
0.9350065330378543, 0.7067734980023662, 0.8973410411573094,
0.7506167730772516, 0.8572957679511382, 0.7917579157122047,
0.8146281598803492, 0.8288026446367324, 0.7679019642224981,
0.8610918526053805, 0.7171624168757985, 0.8867491048162915,
0.6630344261248556, 0.9067293813428708, 0.6047095492618413,
0.9219649147678989, 0.5451295187190602, 0.9338619041815587,
0.4852292097262674, 0.9413455695142587, 0.424454780475834,
0.9447753107545577, 0.3642347111991026, 0.9452649776939869,
0.30388458223793025, 0.9450854849661369, 0.24432737691068557,
0.1594802473020129, 0.17495177946520288, 0.2082918411850002,
0.12758378330875153, 0.27675902873293057, 0.11712230823088154,
0.34660582049732336, 0.12782553369032904, 0.4137234315527489,
0.14788458441422778, 0.4123890243720449, 0.18814226684806626,
0.3498927810760776, 0.17640650480816664, 0.28590212091591866,
0.16895271174960227, 0.22193967489846017, 0.16985862149585013,
0.5861805004572298, 0.147863456192582, 0.6532904167464643,
0.12780412047734288, 0.723142364263288, 0.11709102395419578,
0.7916076475508984, 0.12753867695205595, 0.8404440227263494,
0.17488715120168932, 0.7779848023963316, 0.1698261195288917,
0.7140264757991571, 0.1689377237959271, 0.650024882334848,
0.17640581823811927, 0.5875270068157493, 0.18815421057605972,
0.4999687027691624, 0.2770570778583906, 0.49996466107378934,
0.35408433007759227, 0.49996725190415664, 0.43227025345368053,
0.49997367716346774, 0.5099309118810921, 0.443147025685285,
0.2837021691260901, 0.4079306716593004, 0.4729519900478952,
0.3786223176615041, 0.5388017782630576, 0.4166237366074797,
0.5822229552544941, 0.4556754522760756, 0.5887956328134262,
0.49998730493119997, 0.5951855531982454, 0.5443300921009105,
0.5887796732983633, 0.5833722476054509, 0.582200985012979,
0.6213509190608012, 0.5387760772258134, 0.5920137550293199,
0.4729325070035326, 0.5567854054587345, 0.28368589871138317,
0.23395988420439123, 0.275313734012504, 0.27156519109550253,
0.2558735678926061, 0.31487949633428597, 0.2523033259214858,
0.356919009399118, 0.2627342680634766, 0.3866625969903256,
0.2913618036573405, 0.3482919069920915, 0.3009936818974329,
0.3064437008415846, 0.3037349617842158, 0.26724000706363993,
0.2961896087804692, 0.3135744691699477, 0.27611103614975246,
0.6132904312551143, 0.29135144033587107, 0.6430396927648264,
0.2627079452269443, 0.6850713556136455, 0.2522730391144915,
0.728377707003201, 0.25583118190779625, 0.7660035591791254,
0.27526375689471777, 0.7327054300488236, 0.2961495286346863,
0.6935171517115648, 0.3036951925380769, 0.6516533228539426,
0.3009921014909089, 0.6863983789278025, 0.2760904908649394,
0.35811903020866753, 0.7233174007629063, 0.4051199834269763,
0.6931800846807724, 0.4629631471997891, 0.6718031951363689,
0.5000016063148277, 0.6799150331999366, 0.5370506360177653,
0.6717809139952097, 0.5948714927411151, 0.6931581144392573,
0.6418878095835022, 0.7232890570786875, 0.6088129582142587,
0.7713407215524752, 0.5601450388292929, 0.8052499757498277,
0.5000181358125715, 0.8160749831906926, 0.4398905591799545,
0.8052697696938342, 0.39120318265892984, 0.771375905028864,
0.36888771299734613, 0.7241751210643214, 0.4331097084010058,
0.7194543690519717, 0.5000188612450743, 0.7216823277180712,
0.566895861884284, 0.7194302225129479, 0.631122598507516,
0.7241462073974219, 0.5678462302796355, 0.7386355816766528,
0.5000082906571756, 0.7479600838019628, 0.43217532542902076,
0.7386538729390463, 0.31371761254774383, 0.2753328284323114,
0.6862487843823917, 0.2752940437017121
]
IMAGE_SIZE = 96
loss_config = dict(
num_points=POINT_NUMBER,
left_eye_left_corner_index=66,
right_eye_right_corner_index=79,
points_weight=1.0,
contour_weight=1.5,
eyebrow_weight=1.5,
eye_weight=1.7,
nose_weight=1.3,
lip_weight=1.7,
omega=10,
epsilon=2)
model = dict(
type='FaceKeypoint',
backbone=dict(
type='FaceKeypointBackbone',
in_channels=3,
out_channels=48,
residual_activation='relu',
inverted_activation='half_v2',
inverted_expand_ratio=2,
),
keypoint_head=dict(
type='FaceKeypointHead',
in_channels=48,
out_channels=POINT_NUMBER * 2,
input_size=IMAGE_SIZE,
inverted_expand_ratio=2,
inverted_activation='half_v2',
mean_face=MEAN_FACE,
loss_keypoint=dict(type='WingLossWithPose', **loss_config),
),
pose_head=dict(
type='FacePoseHead',
in_channels=48,
out_channels=3,
inverted_expand_ratio=2,
inverted_activation='half_v2',
loss_pose=dict(type='FacePoseLoss', pose_weight=0.01),
),
)
train_pipeline = [
dict(type='FaceKeypointRandomAugmentation', input_size=IMAGE_SIZE),
dict(type='FaceKeypointNorm', input_size=IMAGE_SIZE),
dict(type='MMToTensor'),
dict(
type='NormalizeTensor',
mean=[0.4076, 0.458, 0.485],
std=[1.0, 1.0, 1.0]),
dict(
type='Collect',
keys=[
'img', 'target_point', 'target_point_mask', 'target_pose',
'target_pose_mask'
])
]
val_pipeline = [
dict(type='FaceKeypointNorm', input_size=IMAGE_SIZE),
dict(type='MMToTensor'),
dict(
type='NormalizeTensor',
mean=[0.4076, 0.458, 0.485],
std=[1.0, 1.0, 1.0]),
dict(
type='Collect',
keys=[
'img', 'target_point', 'target_point_mask', 'target_pose',
'target_pose_mask'
])
]
test_pipeline = val_pipeline
data_root = 'path/to/face_landmark_data/'
data_cfg = dict(
data_root=data_root,
input_size=IMAGE_SIZE,
)
data = dict(
imgs_per_gpu=512,
workers_per_gpu=2,
train=dict(
type='FaceKeypointDataset',
data_source=dict(
type='FaceKeypintSource',
train=True,
data_range=[0, 30000], # [0,30000] [0,478857]
data_cfg=data_cfg,
),
pipeline=train_pipeline),
val=dict(
type='FaceKeypointDataset',
data_source=dict(
type='FaceKeypintSource',
train=False,
data_range=[478857, 488857],
# data_range=[478857, 478999], #[478857, 478999] [478857, 488857]
data_cfg=data_cfg,
),
pipeline=val_pipeline),
test=dict(
type='FaceKeypointDataset',
data_source=dict(
type='FaceKeypintSource',
train=False,
data_range=[478857, 488857],
# data_range=[478857, 478999], #[478857, 478999] [478857, 488857]
data_cfg=data_cfg,
),
pipeline=test_pipeline),
)
# runtime setting
optimizer = dict(
type='Adam',
lr=0.005,
)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
policy='CosineAnnealing',
min_lr=0.00001,
warmup='linear',
warmup_iters=10,
warmup_ratio=0.001,
warmup_by_epoch=True,
by_epoch=True)
total_epochs = 1000
checkpoint_config = dict(interval=10)
log_config = dict(
interval=5, hooks=[
dict(type='TextLoggerHook'),
])
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
# disable opencv multithreading to avoid system being overloaded
opencv_num_threads = 0
# set multi-process start method as `fork` to speed up the training
mp_start_method = 'fork'
evaluation = dict(interval=1, metric=['NME'], save_best='NME')
eval_config = dict(interval=1)
evaluator_args = dict(metric_names='ave_nme')
eval_pipelines = [
dict(
mode='test',
data=dict(**data['val'], imgs_per_gpu=1),
evaluators=[dict(type='FaceKeypointEvaluator', **evaluator_args)])
]
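For reference, this config can be loaded and the model built with EasyCV's config utilities. A minimal sketch (illustration only, not part of this commit; the config path is the one referenced by the unit test below, and building the model needs a CUDA device because WingLossWithPose moves its part weights to the GPU at construction time):

```python
# Minimal sketch: load the config above and build the FaceKeypoint model from it.
# Assumes easycv is installed and a CUDA device is available.
from easycv.models import build_model
from easycv.utils.config_tools import mmcv_config_fromfile

cfg = mmcv_config_fromfile('configs/face/face_96x96_wingloss.py')
model = build_model(cfg.model)  # FaceKeypoint with keypoint and pose heads
print(type(model).__name__)     # 'FaceKeypoint'
```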

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1a45cc56977e709361659d4123739d3647d122a0d80bf7249d0ccdef018f068e
size 112042

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8d516f30a8c1583b45e54d737d2a712ed1c63ef387d579517e1e23e416339ac2
size 94367

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5105c5aa83c59d2a1fdf8dc9ff83a8d84c19a70c7faabcf7f8bce8a913afe4f1
size 3421031

View File

@ -3,6 +3,7 @@ from .auc_eval import AucEvaluator
from .base_evaluator import Evaluator
from .classification_eval import ClsEvaluator
from .coco_evaluation import CocoDetectionEvaluator, CoCoPoseTopDownEvaluator
from .face_eval import FaceKeypointEvaluator
from .faceid_pair_eval import FaceIDPairEvaluator
from .keypoint_eval import KeyPointEvaluator
from .mse_eval import MSEEvaluator

View File

@ -0,0 +1,59 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
from .base_evaluator import Evaluator
from .builder import EVALUATORS
from .metric_registry import METRICS
@EVALUATORS.register_module
class FaceKeypointEvaluator(Evaluator):
def __init__(self, dataset_name=None, metric_names=['ave_nme']):
super(FaceKeypointEvaluator, self).__init__(dataset_name, metric_names)
self.metric = metric_names
self.dataset_name = dataset_name
def _evaluate_impl(self, prediction_dict, groundtruth_dict, **kwargs):
"""
Args:
prediction_dict: model forward output dict, ['point', 'pose']
groundtruth_dict: groundtruth dict, ['target_point', 'target_point_mask', 'target_pose', 'target_pose_mask'] used for compute accuracy
kwargs: other parameters
"""
def evaluate(predicts, gts, **kwargs):
from easycv.models.utils.face_keypoint_utils import get_keypoint_accuracy, get_pose_accuracy
ave_pose_acc = 0
ave_nme = 0
idx = 0
for (predict_point, predict_pose,
gt) in zip(predicts['point'], predicts['pose'], gts):
target_point = gt['target_point']
target_point_mask = gt['target_point_mask']
target_pose = gt['target_pose']
target_pose_mask = gt['target_pose_mask']
target_point = target_point * target_point_mask
target_pose = target_pose * target_pose_mask
keypoint_accuracy = get_keypoint_accuracy(
predict_point, target_point)
pose_accuracy = get_pose_accuracy(predict_pose, target_pose)
ave_pose_acc += pose_accuracy['pose_acc']
ave_nme += keypoint_accuracy['nme']
idx += 1
eval_result = {}
idx += 0.000001  # avoid division by zero when there are no samples
eval_result['ave_pose_acc'] = ave_pose_acc / idx
eval_result['ave_nme'] = ave_nme / idx
return eval_result
return evaluate(prediction_dict, groundtruth_dict)
METRICS.register_default_best_metric(FaceKeypointEvaluator, 'ave_nme', 'min')
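To illustrate the expected input format, a minimal sketch with synthetic tensors (assumes easycv and its dependencies are installed): predictions arrive as batched tensors under 'point' and 'pose', while the ground truth is a list of per-sample dicts like the ones FaceKeypintSource stores in its db:

```python
# Minimal sketch: feeding FaceKeypointEvaluator with synthetic data.
import torch
from easycv.core.evaluation import FaceKeypointEvaluator

num_points, batch = 106, 2
prediction_dict = {
    'point': torch.rand(batch, num_points * 2) * 96,  # flattened (x, y) keypoints
    'pose': torch.rand(batch, 3) * 30,                # pitch / roll / yaw in degrees
}
groundtruth_dict = [{
    'target_point': torch.rand(num_points * 2) * 96,
    'target_point_mask': torch.ones(num_points * 2),
    'target_pose': torch.rand(3) * 30,
    'target_pose_mask': torch.ones(3),
} for _ in range(batch)]

evaluator = FaceKeypointEvaluator(metric_names=['ave_nme'])
result = evaluator.evaluate(
    prediction_dict=prediction_dict, groundtruth_dict=groundtruth_dict)
print(result)  # e.g. {'ave_pose_acc': ..., 'ave_nme': ...}
```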

View File

@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from . import classification, detection, pose, segmentation, selfsup, shared
from . import (classification, detection, face, pose, segmentation, selfsup,
shared)
from .builder import build_dali_dataset, build_dataset
from .loader import DistributedGroupSampler, GroupSampler, build_dataloader
from .registry import DATASETS

View File

@ -0,0 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .data_sources import *
from .face_keypoint_dataset import FaceKeypointDataset
from .pipelines import *

View File

@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .face_keypoint_source import FaceKeypintSource

View File

@ -0,0 +1,171 @@
import copy
import json
import logging
import os
import cv2
import numpy as np
import torch
from easycv.datasets.face.pipelines.face_keypoint_transform import (
FaceKeypointNorm, FaceKeypointRandomAugmentation, normal)
from easycv.datasets.registry import DATASOURCES
from easycv.datasets.shared.base import BaseDataset
FACE_KEYPOINT_DATASET_INFO = dict(
real_list_file_dir='real_face_list.txt',
data_info_dir='infos/merge/',
data_image_dir='images/merge/',
data_overlay_dir='images/overlay/',
)
@DATASOURCES.register_module()
class FaceKeypintSource():
"""
load dataset for face key points
"""
def __init__(self,
data_cfg,
data_range,
real_list_path=None,
info_path=None,
image_path=None,
data_overlay_path=None,
dataset_info=None,
**kwargs):
super(FaceKeypintSource, self).__init__()
"""
Args:
data_cfg: Data config dict
data_range: index range of the dataset used for training or validation
real_list_path: path to the file that lists image ids
info_path: annotation directory
image_path: image directory
data_overlay_path: overlay background image directory
dataset_info: A dict containing all dataset info
"""
if dataset_info is None:
logging.info(
'dataset_info is missing, use default face keypoint dataset info'
)
dataset_info = FACE_KEYPOINT_DATASET_INFO
data_root = data_cfg['data_root']
real_list_file_path = os.path.join(data_root,
dataset_info['real_list_file_dir'])
data_info_dir = os.path.join(data_root, dataset_info['data_info_dir'])
data_img_dir = os.path.join(data_root, dataset_info['data_image_dir'])
data_overlay_dir = os.path.join(data_root,
dataset_info['data_overlay_dir'])
self.input_size = data_cfg['input_size']
data_range = data_range
if real_list_path is not None:
real_list_file_path = real_list_path
if info_path is not None:
data_info_dir = info_path
if image_path is not None:
data_img_dir = image_path
if data_overlay_path is not None:
data_overlay_dir = data_overlay_path
# overlay
self.overlay_image_path = []
for overlay_img_file in sorted(os.listdir(data_overlay_dir)):
overlay_img_filepath = os.path.join(data_overlay_dir,
overlay_img_file)
self.overlay_image_path.append(overlay_img_filepath)
self.points_and_pose_datas = []
with open(real_list_file_path, 'r') as real_list_file:
real_list_lines = real_list_file.readlines()
for index in range(data_range[0], data_range[1]):
idx = int(real_list_lines[index])
img_path = os.path.join(data_img_dir, '{:06d}.png'.format(idx))
if not os.path.exists(img_path):
logging.warning('image %s does not exist' % img_path)
continue
info_path = os.path.join(data_info_dir, '{:06d}.json'.format(idx))
if not os.path.exists(info_path):
logging.warning('annotation %s does not exist' % info_path)
continue
with open(info_path, 'r') as info_file:
info_json = json.load(info_file)
assert info_json['face_count'] == 1
base_info = info_json['face_infos'][0]['base_info']
# points
assert base_info['points_array'] is not None
points = np.asarray(base_info['points_array']).astype(
np.float32)
points_mask = np.abs(points - (-999)) > 0.0001
# pose
pose = {'pitch': -999, 'yaw': -999, 'roll': -999}
if base_info['pitch'] is not None and base_info[
'yaw'] is not None and base_info['roll'] is not None:
pose['pitch'] = base_info['pitch']
pose['yaw'] = base_info['yaw']
# pose["roll"] = base_info["roll"]
# datasets have been preprocessed, roll=0
# add noise to pose
pose['roll'] = normal() * 10.0
pose_mask = np.asarray([
np.abs(pose['pitch'] - (-999)) > 0.0001,
np.abs(pose['roll'] - (-999)) > 0.0001,
np.abs(pose['yaw'] - (-999)) > 0.0001
])
self.points_and_pose_datas.append(
(img_path, points, points_mask, pose, pose_mask))
self.db = []
for img_path, points, points_mask, pose, pose_mask in copy.deepcopy(
self.points_and_pose_datas):
image = cv2.imread(img_path)
points[:,
0] = points[:, 0] / image.shape[1] * float(self.input_size)
points[:,
1] = points[:, 1] / image.shape[0] * float(self.input_size)
target_point = np.reshape(points,
(points.shape[0] * points.shape[1]))
points_mask = points_mask.astype(np.float32)
points_mask = np.reshape(
points_mask, (points_mask.shape[0] * points_mask.shape[1]))
pose = np.asarray([pose['pitch'], pose['roll'], pose['yaw']])
self.db.append({
'img_path':
img_path,
'target_point':
torch.tensor(np.array(target_point, np.float32)),
'target_point_mask':
torch.tensor(points_mask),
'target_pose':
torch.tensor(np.array(pose, np.float32)),
'target_pose_mask':
torch.tensor(pose_mask.astype(np.float32))
})
def __getitem__(self, index):
img_path, points, points_mask, pose, pose_mask = copy.deepcopy(
self.points_and_pose_datas[index])
image = cv2.imread(img_path)
return {
'img': image,
'target_point': points,
'target_point_mask': points_mask,
'target_pose': pose,
'target_pose_mask': pose_mask,
'overlay_image_path': self.overlay_image_path
}
def __len__(self):
return len(self.points_and_pose_datas)

View File

@ -0,0 +1,45 @@
import copy
import json
import logging
import os
import cv2
import numpy as np
import torch
import torch.utils.data as data
from easycv.datasets.face.pipelines.face_keypoint_transform import (
FaceKeypointNorm, FaceKeypointRandomAugmentation, normal)
from easycv.datasets.registry import DATASETS
from easycv.datasets.shared.base import BaseDataset
@DATASETS.register_module()
class FaceKeypointDataset(BaseDataset):
"""
dataset for face key points
"""
def __init__(self, data_source, pipeline, profiling=False):
super(FaceKeypointDataset, self).__init__(data_source, pipeline,
profiling)
"""
Args:
data_source: Data_source config dict
pipeline: Pipeline config list
profiling: If set True, will print pipeline time
"""
def evaluate(self, outputs, evaluators, **kwargs):
eval_result = {}
for evaluator in evaluators:
eval_result.update(
evaluator.evaluate(
prediction_dict=outputs,
groundtruth_dict=self.data_source.db))
return eval_result
def __getitem__(self, idx):
results = self.data_source[idx]
return self.pipeline(results)

View File

@ -0,0 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .face_keypoint_transform import (FaceKeypointNorm,
FaceKeypointRandomAugmentation)
__all__ = ['FaceKeypointRandomAugmentation', 'FaceKeypointNorm']

View File

@ -0,0 +1,431 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import random
import cv2
import imgaug
import imgaug.augmenters as iaa
import numpy as np
from easycv.datasets.registry import PIPELINES
DEST_SIZE = 256
BASE_LANDMARK_NUM = 106
ENLARGE_RATIO = 1.1
CONTOUR_PARTS = [[0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27], [6, 26],
[7, 25], [8, 24], [9, 23], [10, 22], [11, 21], [12, 20],
[13, 19], [14, 18], [15, 17]]
BROW_PARTS = [[33, 46], [34, 45], [35, 44], [36, 43], [37, 42], [38, 50],
[39, 49], [40, 48], [41, 47]]
EYE_PARTS = [[66, 79], [67, 78], [68, 77], [69, 76], [70, 75], [71, 82],
[72, 81], [73, 80], [74, 83]]
NOSE_PARTS = [[55, 65], [56, 64], [57, 63], [58, 62], [59, 61]]
MOUSE_PARTS = [[84, 90], [85, 89], [86, 88], [96, 100], [97, 99], [103, 101],
[95, 91], [94, 92]]
IRIS_PARTS = [[104, 105]]
MATCHED_PARTS = CONTOUR_PARTS + BROW_PARTS + EYE_PARTS + NOSE_PARTS + MOUSE_PARTS + IRIS_PARTS
def normal():
"""
3-sigma rule: sample from N(0, 1) and reject values outside 3 sigma
return: a value in (-1, +1)
"""
mu, sigma = 0, 1
while True:
s = np.random.normal(mu, sigma)
if s < mu - 3 * sigma or s > mu + 3 * sigma:
continue
return s / 3 * sigma
def rotate(angle, center, landmark):
rad = angle * np.pi / 180.0
alpha = np.cos(rad)
beta = np.sin(rad)
M = np.zeros((2, 3), dtype=np.float32)
M[0, 0] = alpha
M[0, 1] = beta
M[0, 2] = (1 - alpha) * center[0] - beta * center[1]
M[1, 0] = -beta
M[1, 1] = alpha
M[1, 2] = beta * center[0] + (1 - alpha) * center[1]
landmark_ = np.asarray([(M[0, 0] * x + M[0, 1] * y + M[0, 2],
M[1, 0] * x + M[1, 1] * y + M[1, 2])
for (x, y) in landmark])
return M, landmark_
class OverLayGenerator:
def __init__(self, shape):
# 4x4
h_seg_len = shape[0] // 4
w_seg_len = shape[1] // 4
self.overlay = []
# 2x2 overlay
for i in range(3):
for j in range(3):
if i == 1 and j == 1:
continue
self.overlay.append((i * w_seg_len, j * h_seg_len,
2 * w_seg_len, 2 * h_seg_len))
# 2x3 overlay
for i in range(3):
for j in range(2):
if i == 1:
continue
self.overlay.append((i * w_seg_len, j * h_seg_len,
2 * w_seg_len, 3 * h_seg_len))
for i in range(2):
for j in range(3):
if j == 1:
continue
self.overlay.append((i * w_seg_len, j * h_seg_len,
3 * w_seg_len, 2 * h_seg_len))
# 2x4 overlay
for i in range(3):
for j in range(1):
if i == 1:
continue
self.overlay.append((i * w_seg_len, j * h_seg_len,
2 * w_seg_len, 4 * h_seg_len))
for i in range(1):
for j in range(3):
if j == 1:
continue
self.overlay.append((i * w_seg_len, j * h_seg_len,
4 * w_seg_len, 2 * h_seg_len))
class FaceKeypointsDataAugumentation:
def __init__(self, input_size):
# option
self.enable_flip = True
self.enable_rotate = True
self.input_size = input_size
# mask generator
coarse_salt_and_pepper_iaa = iaa.CoarseSaltAndPepper(
(0.25, 0.35), size_percent=(0.03125, 0.015625))
self.mask_generator = coarse_salt_and_pepper_iaa.mask
# overlay generator
self.overlay_generator = OverLayGenerator(shape=(256, 256))
# flip
self.mirror_map = FaceKeypointsDataAugumentation.compute_mirror_map()
@staticmethod
def compute_mirror_map():
mirror_map = np.array(range(0, BASE_LANDMARK_NUM), np.int32)
for x, y in MATCHED_PARTS:
mirror_map[x] = y
mirror_map[y] = x
return mirror_map
def aug_flip(self, img, pts, visibility, pose):
# pts[:, 0] = self.input_size - pts[:, 0]
pts[:, 0] = img.shape[1] - pts[:, 0]
pts = pts[self.mirror_map]
if visibility is not None:
visibility = visibility[self.mirror_map]
img = cv2.flip(img, 1)
if pose is not None:
# fix roll&yaw in pose
pose['roll'] = -pose['roll']
pose['yaw'] = -pose['yaw']
return img, pts, visibility, pose
def aug_rotate(self, img, pts, pose, angle):
center = [DEST_SIZE // 2, DEST_SIZE // 2]
if pose is not None:
# fix roll in pose
pose['roll'] += angle
cx, cy = center
M, pts = rotate(angle, (cx, cy), pts)
imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0])))
x1 = np.min(pts[:, 0])
x2 = np.max(pts[:, 0])
y1 = np.min(pts[:, 1])
y2 = np.max(pts[:, 1])
w = x2 - x1 + 1
h = y2 - y1 + 1
x1 = int(x1 - (ENLARGE_RATIO - 1.0) / 2.0 * w)
y1 = int(y1 - (ENLARGE_RATIO - 1.0) * h)
new_w = int(ENLARGE_RATIO * (1 + normal() * 0.25) * w)
new_h = int(ENLARGE_RATIO * (1 + normal() * 0.25) * h)
new_x1 = x1 + int(normal() * DEST_SIZE * 0.15)
new_y1 = y1 + int(normal() * DEST_SIZE * 0.15)
new_x2 = new_x1 + new_w
new_y2 = new_y1 + new_h
new_xy = new_x1, new_y1
pts = pts - new_xy
height, width, _ = imgT.shape
dx = max(0, -new_x1)
dy = max(0, -new_y1)
new_x1 = max(0, new_x1)
new_y1 = max(0, new_y1)
edx = max(0, new_x2 - width)
edy = max(0, new_y2 - height)
new_x2 = min(width, new_x2)
new_y2 = min(height, new_y2)
imgT = imgT[new_y1:new_y2, new_x1:new_x2]
if dx > 0 or dy > 0 or edx > 0 or edy > 0:
imgT = cv2.copyMakeBorder(
imgT,
dy,
edy,
dx,
edx,
cv2.BORDER_CONSTANT,
value=(103.94, 116.78, 123.68))
return imgT, pts, pose
def random_mask(self, img):
mask = self.mask_generator.draw_samples(size=img.shape)
mask = np.expand_dims(np.sum(mask, axis=-1) > 0, axis=-1)
return mask
def random_overlay(self):
index = np.random.choice(len(self.overlay_generator.overlay))
overlay = self.overlay_generator.overlay[index]
return overlay
def augment_blur(self, img):
h, w = img.shape[:2]
assert h == w
ssize = int(random.uniform(0.01, 0.5) * h)
aug_seq = iaa.Sequential([
iaa.Sometimes(
1.0,
iaa.OneOf([
iaa.GaussianBlur((3, 15)),
iaa.AverageBlur(k=(3, 15)),
iaa.MedianBlur(k=(3, 15)),
iaa.MotionBlur((5, 25))
])),
iaa.Resize(ssize, interpolation=imgaug.ALL),
iaa.Sometimes(
0.6,
iaa.OneOf([
iaa.AdditiveGaussianNoise(
loc=0, scale=(0.0, 0.1 * 255), per_channel=0.5),
iaa.AdditiveLaplaceNoise(
loc=0, scale=(0.0, 0.1 * 255), per_channel=0.5),
iaa.AdditivePoissonNoise(lam=(0, 30), per_channel=0.5)
])),
iaa.Sometimes(0.8, iaa.JpegCompression(compression=(40, 90))),
iaa.Resize(h),
])
aug_img = aug_seq.augment_image(img)
return aug_img
def augment_color_temperature(self, img):
aug = iaa.ChangeColorTemperature((1000, 40000))
aug_img = aug.augment_image(img)
return aug_img
def aug_clr_noise_blur(self, img):
# skin&light
if np.random.choice((True, False), p=[0.05, 0.95]):
img_ycrcb_raw = cv2.cvtColor(img, cv2.COLOR_BGR2YCR_CB)
skin_factor_list = [0.6, 0.8, 1.0, 1.2, 1.4]
skin_factor = np.random.choice(skin_factor_list)
img_ycrcb_raw[:, :, 0:1] = np.clip(
img_ycrcb_raw[:, :, 0:1].astype(np.float32) * skin_factor, 0,
255).astype(np.uint8)
img = cv2.cvtColor(img_ycrcb_raw, cv2.COLOR_YCR_CB2BGR)
# gauss blur 5%
if np.random.choice((True, False), p=[0.05, 0.95]):
sigma = np.random.choice([0.25, 0.50, 0.75])
gauss_blur_iaa = iaa.GaussianBlur(sigma=sigma)
img = gauss_blur_iaa(image=img)
# gauss noise 5%
if np.random.choice((True, False), p=[0.05, 0.95]):
scale = np.random.choice([0.01, 0.03, 0.05])
gauss_noise_iaa = iaa.AdditiveGaussianNoise(scale=scale * 255)
img = gauss_noise_iaa(image=img)
# motion blur 5%
if np.random.choice((True, False), p=[0.05, 0.95]):
angle = np.random.choice([0, 45, 90, 135, 180, 225, 270, 315])
motion_blur_iaa = iaa.MotionBlur(k=5, angle=angle)
img = motion_blur_iaa(image=img)
# jpeg compress 5%
if np.random.choice((True, False), p=[0.05, 0.95]):
jpeg_compress_iaa = iaa.JpegCompression(compression=(10, 50))
img = jpeg_compress_iaa(image=img)
# gamma contrast 5%
if np.random.choice((True, False), p=[0.05, 0.95]):
gamma_contrast_iaa = iaa.GammaContrast((0.85, 1.15))
img = gamma_contrast_iaa(image=img)
# brightness 5%
if np.random.choice((True, False), p=[0.05, 0.95]):
brightness_iaa = iaa.MultiplyAndAddToBrightness(
mul=(0.85, 1.15), add=(-10, 10))
img = brightness_iaa(image=img)
return img
def augment_set(self, img):
noisy_image = img.copy().astype(np.uint8)
if np.random.choice((True, False), p=[0.6, 0.4]):
aug = iaa.ChangeColorTemperature((1000, 40000))
noisy_image = aug.augment_image(noisy_image)
if np.random.choice((True, False), p=[0.8, 0.2]):
aug_seq = iaa.Sequential([
iaa.Sometimes(0.5, iaa.JpegCompression(compression=(40, 90))),
iaa.Sometimes(0.5, iaa.MotionBlur((3, 7))),
iaa.Sometimes(
0.5,
iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05 * 255))),
],
random_order=True)
noisy_image = aug_seq.augment_image(noisy_image)
sometimes = lambda aug: iaa.Sometimes(0.25, aug)
seq = iaa.Sequential([
sometimes(iaa.AverageBlur(k=(2, 5))),
sometimes(iaa.GammaContrast((0.5, 2.0)))
],
random_order=True)
noisy_image = seq(images=noisy_image)
return noisy_image
@PIPELINES.register_module()
class FaceKeypointNorm:
"""Data augmentation with Norm.
"""
def __init__(self, input_size=96):
self.input_size = input_size
def __call__(self, results):
"""Perform data augmentation with random image flip."""
# for key in results.get('img', []):
if 'img' in results.keys():
image = results['img']
image = cv2.resize(image, (self.input_size, self.input_size))
results['img'] = np.array(image)
# for key in results.get('target_point', []):
if 'target_point' in results.keys():
points = results['target_point']
points[:, 0] = points[:, 0] / image.shape[1] * float(
self.input_size)
points[:, 1] = points[:, 1] / image.shape[0] * float(
self.input_size)
target_point = np.reshape(points,
(points.shape[0] * points.shape[1]))
results['target_point'] = np.array(target_point, np.float32)
else:
results['target_point'] = np.array(np.zeros(212), np.float32)
# for key in results.get('target_point_mask', []):
if 'target_point_mask' in results.keys():
points_mask = results['target_point_mask']
points_mask = points_mask.astype(np.float32)
points_mask = np.reshape(
points_mask, (points_mask.shape[0] * points_mask.shape[1]))
results['target_point_mask'] = points_mask.astype(np.float32)
else:
results['target_point_mask'] = np.array(
np.zeros(212), np.float32)
# for key in results.get('target_pose', []):
if 'target_pose' in results.keys():
pose = results['target_pose']
pose = np.asarray([pose['pitch'], pose['roll'], pose['yaw']])
results['target_pose'] = pose.astype(np.float32)
else:
results['target_pose'] = np.array(np.zeros(3), np.float32)
if 'target_pose_mask' not in results.keys():
results['target_pose_mask'] = np.array(np.zeros(3), np.float32)
return results
@PIPELINES.register_module()
class FaceKeypointRandomAugmentation:
"""Data augmentation with random flip.
"""
def __init__(self, input_size=96):
self.input_size = input_size
# Data Augment
self.data_aug = FaceKeypointsDataAugumentation(self.input_size)
def __call__(self, results):
"""Perform data augmentation with random image flip."""
image = results['img']
points = results['target_point']
points_mask = results['target_point_mask']
pose = results['target_pose']
pose_mask = results['target_pose_mask']
overlay_image_path = results['overlay_image_path']
if np.random.choice((True, False), p=[0.2, 0.8]):
# overlay
overlay_pos = self.data_aug.random_overlay()
overlay_img_index = np.random.choice(len(overlay_image_path))
overlay_img_filepath = overlay_image_path[overlay_img_index]
overlay_img = cv2.imread(overlay_img_filepath,
cv2.IMREAD_UNCHANGED)
(x, y, w, h) = overlay_pos
x1, y1, x2, y2 = x, y, x + w, y + h
overlay_img = cv2.resize(overlay_img, dsize=(w, h))
overlay_mask = overlay_img[:, :, 3:4] / 255.0
image[y1:y2, x1:x2, :] = image[y1:y2, x1:x2, :] * (
1 - overlay_mask) + overlay_img[:, :, 0:3] * overlay_mask
image = image.astype(np.uint8)
angle = pose['roll']
image, points, pose = self.data_aug.aug_rotate(
image, points, pose, angle) # counterclockwise rotate angle
pose['roll'] = angle # reset roll=angle
if np.random.choice((True, False)):
image_transform, points, _, pose = self.data_aug.aug_flip(
image, points, None, pose)
else:
image_transform = image
image_transform = self.data_aug.aug_clr_noise_blur(image_transform)
results['img'] = image_transform
results['target_point'] = points
results['target_pose'] = pose
return results
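As a usage note, FaceKeypointNorm also works on inference-only inputs: when the target keys are absent it fills zero placeholders, so the same val/test pipeline can be reused for raw images. A minimal sketch (assumes easycv and imgaug are installed):

```python
# Minimal sketch: FaceKeypointNorm resizes the image and zero-fills missing targets.
import numpy as np
from easycv.datasets.face.pipelines import FaceKeypointNorm

sample = {'img': np.zeros((256, 256, 3), dtype=np.uint8)}
out = FaceKeypointNorm(input_size=96)(sample)
print(out['img'].shape)           # (96, 96, 3)
print(out['target_point'].shape)  # (212,), zero-filled placeholder
```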

View File

@ -3,6 +3,7 @@ from .backbones import * # noqa: F401,F403
from .builder import build_backbone, build_head, build_loss, build_model
from .classification import *
from .detection import *
from .face import *
from .heads import *
from .loss import *
from .pose import TopDown

View File

@ -4,6 +4,7 @@ from .bninception import BNInception
from .conv_mae_vit import FastConvMAEViT
from .conv_vitdet import ConvViTDet
from .efficientformer import EfficientFormer
from .face_keypoint_backbone import FaceKeypointBackbone
from .genet import PlainNet
from .hrnet import HRNet
from .inceptionv3 import Inception3

View File

@ -0,0 +1,90 @@
import torch.nn as nn
from easycv.models.registry import BACKBONES
from easycv.models.utils.face_keypoint_utils import InvertedResidual, Residual
@BACKBONES.register_module
class FaceKeypointBackbone(nn.Module):
def __init__(self,
in_channels=3,
out_channels=48,
residual_activation='relu',
inverted_activation='half_v2',
inverted_expand_ratio=2):
super(FaceKeypointBackbone, self).__init__()
self.conv1 = Residual(in_channels, 12, 3, 2, 0)
self.conv2 = Residual(12, 12, 3, 1, 0, activation=residual_activation)
self.conv3 = Residual(12, 12, 3, 1, 1, activation=residual_activation)
self.conv4 = Residual(12, 12, 3, 1, 0, activation=residual_activation)
self.conv5 = Residual(12, 24, 3, 2, 0, activation=residual_activation)
self.conv6 = Residual(24, 24, 3, 1, 0, activation=residual_activation)
self.conv7 = Residual(24, 24, 3, 1, 1, activation=residual_activation)
self.conv8 = Residual(24, 24, 3, 1, 1, activation=residual_activation)
self.conv9 = InvertedResidual(
24,
48,
3,
2,
0,
expand_ratio=inverted_expand_ratio,
activation=inverted_activation)
self.conv10 = InvertedResidual(
48,
48,
3,
1,
0,
expand_ratio=inverted_expand_ratio,
activation=inverted_activation)
self.conv11 = InvertedResidual(
48,
48,
3,
1,
1,
expand_ratio=inverted_expand_ratio,
activation=inverted_activation)
self.conv12 = InvertedResidual(
48,
48,
3,
1,
1,
expand_ratio=inverted_expand_ratio,
activation=inverted_activation)
self.conv13 = InvertedResidual(
48,
48,
3,
1,
1,
expand_ratio=inverted_expand_ratio,
activation=inverted_activation)
self.conv14 = InvertedResidual(
48,
out_channels,
3,
2,
0,
expand_ratio=inverted_expand_ratio,
activation=inverted_activation)
def forward(self, x):
x1 = self.conv1(x)
x2 = self.conv2(x1)
x3 = self.conv3(x2)
x4 = self.conv4(x3)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
x7 = self.conv7(x6)
x8 = self.conv8(x7)
x9 = self.conv9(x8)
x10 = self.conv10(x9)
x11 = self.conv11(x10)
x12 = self.conv12(x11)
x13 = self.conv13(x12)
x14 = self.conv14(x13)
return x14
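A quick shape check helps explain the head dimensions that follow: for the 96x96 inputs used in the config, the strided blocks reduce the feature map to 3x3 with 48 channels, which is why the pose head flattens 48*3*3 and the keypoint head 96*3*3. A minimal sketch (assumes easycv is installed):

```python
# Minimal sketch: verify the backbone output size for a 96x96 input.
import torch
from easycv.models.backbones import FaceKeypointBackbone

net = FaceKeypointBackbone(in_channels=3, out_channels=48)
with torch.no_grad():
    feat = net(torch.randn(1, 3, 96, 96))
print(feat.shape)  # expected: torch.Size([1, 48, 3, 3])
```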

View File

@ -0,0 +1,2 @@
from .face_keypoint import FaceKeypoint
from .head import *

View File

@ -0,0 +1,103 @@
import mmcv
import numpy as np
from easycv.models import builder
from easycv.models.base import BaseModel
from easycv.models.builder import MODELS
from easycv.models.utils.face_keypoint_utils import (get_keypoint_accuracy,
get_pose_accuracy)
@MODELS.register_module()
class FaceKeypoint(BaseModel):
def __init__(self,
backbone,
neck=None,
keypoint_head=None,
pose_head=None,
pretrained=None,
loss_keypoint=None,
loss_pose=None):
super().__init__()
self.pretrained = pretrained
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
if keypoint_head is not None:
if 'loss_keypoint' not in keypoint_head and loss_keypoint is not None:
keypoint_head['loss_keypoint'] = loss_keypoint
self.keypoint_head = builder.build_head(keypoint_head)
if pose_head is not None:
if 'loss_pose' not in pose_head and loss_pose is not None:
pose_head['loss_pose'] = loss_pose
self.pose_head = builder.build_head(pose_head)
@property
def with_neck(self):
"""Check if has keypoint_head."""
return hasattr(self, 'neck')
@property
def with_keypoint(self):
"""Check if has keypoint_head."""
return hasattr(self, 'keypoint_head')
@property
def with_pose(self):
"""Check if has pose_head."""
return hasattr(self, 'pose_head')
def forward_train(self, img, target_point, target_point_mask, target_pose,
target_pose_mask, **kwargs):
"""Defines the computation performed at every call when training."""
output = self.backbone(img)
if self.with_neck:
output = self.neck(output)
if self.with_keypoint:
output_points = self.keypoint_head(output)
if self.with_pose:
output_pose = self.pose_head(output)
target_point = target_point * target_point_mask
target_pose = target_pose * target_pose_mask
losses = dict()
if self.with_keypoint:
keypoint_losses = self.keypoint_head.get_loss(
output_points, target_point, target_point_mask, target_pose)
losses.update(keypoint_losses)
keypoint_accuracy = get_keypoint_accuracy(output_points,
target_point)
losses.update(keypoint_accuracy)
if self.with_pose:
output_pose = output_pose * 180.0 / np.pi
output_pose = output_pose * target_pose_mask
pose_losses = self.pose_head.get_loss(output_pose, target_pose)
losses.update(pose_losses)
pose_accuracy = get_pose_accuracy(output_pose, target_pose)
losses.update(pose_accuracy)
return losses
def forward_test(self, img, **kwargs):
"""Defines the computation performed at every call when testing."""
output = self.backbone(img)
if self.with_neck:
output = self.neck(output)
if self.with_keypoint:
output_points = self.keypoint_head(output)
if self.with_pose:
output_pose = self.pose_head(output)
ret = {}
ret['point'] = output_points
ret['pose'] = output_pose
return ret

View File

@ -0,0 +1,2 @@
from .face_keypoint_head import FaceKeypointHead
from .face_keypoint_pose_head import FacePoseHead

View File

@ -0,0 +1,68 @@
import copy
import numpy as np
import torch
import torch.nn as nn
from easycv.models.builder import HEADS, build_loss
from easycv.models.utils.face_keypoint_utils import (InvertedResidual, View,
conv_bn, conv_no_relu,
get_keypoint_accuracy)
@HEADS.register_module
class FaceKeypointHead(nn.Module):
def __init__(
self,
mean_face,
loss_keypoint,
in_channels=48,
out_channels=212,
input_size=96,
inverted_expand_ratio=2,
inverted_activation='half_v2',
):
super(FaceKeypointHead, self).__init__()
self.input_size = input_size
self.face_mean_shape = copy.deepcopy(np.asarray(mean_face))
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.branches = []
self.loss = build_loss(loss_keypoint)
# points
self.branches.append(
nn.Sequential(
InvertedResidual(
in_channels,
96,
3,
1,
1,
expand_ratio=inverted_expand_ratio,
activation=inverted_activation),
View((-1, 96 * 3 * 3, 1, 1)), conv_bn(96 * 3 * 3, 128, 1, 1,
0),
conv_bn(128, 128, 1, 1, 0),
conv_no_relu(128, out_channels, 1, 1, 0),
View((-1, out_channels))))
self.branches = nn.ModuleList(self.branches)
def get_loss(self, output, target_point, target_point_mask, target_pose):
losses = dict()
loss = self.loss(output * target_point_mask, target_point, target_pose)
losses['point_loss'] = loss
return losses
def get_accuracy(self, output, target_point):
return get_keypoint_accuracy(output, target_point)
def forward(self, x):
point = self.branches[0](x)
point = point * 0.5 + torch.from_numpy(self.face_mean_shape).to(
self.device)
point = point * self.input_size
return point
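In equation form, the head regresses a bounded offset around the mean face: with branch output f_theta(x), the normalized mean shape \bar{p} (MEAN_FACE in the config) and input size S, the forward pass above decodes

```latex
\[
\hat{p} \;=\; \bigl(0.5\, f_\theta(x) + \bar{p}\bigr)\cdot S
\]
```

so the network only learns a residual around an average face rather than absolute pixel coordinates.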

View File

@ -0,0 +1,55 @@
import numpy as np
import torch
import torch.nn as nn
from easycv.models.builder import HEADS, build_loss
from easycv.models.utils.face_keypoint_utils import (InvertedResidual, View,
conv_bn, conv_no_relu,
get_pose_accuracy)
@HEADS.register_module
class FacePoseHead(nn.Module):
def __init__(
self,
loss_pose,
in_channels=48,
out_channels=3,
inverted_expand_ratio=2,
inverted_activation='half_v2',
):
super(FacePoseHead, self).__init__()
self.branches = []
self.loss = build_loss(loss_pose)
# pose
self.branches.append(
nn.Sequential(
InvertedResidual(
in_channels,
48,
3,
1,
1,
expand_ratio=inverted_expand_ratio,
activation=inverted_activation),
View((-1, 48 * 3 * 3, 1, 1)), conv_bn(48 * 3 * 3, 48, 1, 1, 0),
conv_bn(48, 48, 1, 1, 0),
conv_no_relu(48, out_channels, 1, 1, 0),
View((-1, out_channels))))
self.branches = nn.ModuleList(self.branches)
def get_loss(self, output, target_pose):
losses = dict()
loss = self.loss(output, target_pose)
losses['pose_loss'] = loss
return losses
def get_accuracy(self, output, target_pose):
return get_pose_accuracy(output, target_pose)
def forward(self, x):
return self.branches[0](x)

View File

@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .cross_entropy_loss import CrossEntropyLoss
from .face_keypoint_loss import FacePoseLoss, WingLossWithPose
from .focal_loss import FocalLoss
from .iou_loss import GIoULoss, IoULoss
from .mse_loss import JointsMSELoss

View File

@ -0,0 +1,91 @@
import copy
import math
import numpy as np
import torch
import torch.nn as nn
from easycv.models.builder import LOSSES
CONSTANT_CONTOUR = 66
CONSTANT_EYEBROW = 18
CONSTANT_EYE = 18
CONSTANT_NOSE = 30
CONSTANT_LIPS = 40
CONSTANT_EYE_CENTER = 4
@LOSSES.register_module()
class WingLossWithPose(nn.Module):
def __init__(self,
num_points=106,
left_eye_left_corner_index=66,
right_eye_right_corner_index=79,
points_weight=1.0,
contour_weight=1.5,
eyebrow_weight=1.5,
eye_weight=1.7,
nose_weight=1.3,
lip_weight=1.7,
omega=10,
epsilon=2):
super(WingLossWithPose, self).__init__()
self.omega = omega
self.epsilon = epsilon
self.num_points = num_points
self.left_eye_left_corner_index = left_eye_left_corner_index
self.right_eye_right_corner_index = right_eye_right_corner_index
self.points_weight = points_weight
contour_weight = np.full(CONSTANT_CONTOUR, contour_weight)
eyebrow_left_weight = np.full(CONSTANT_EYEBROW, eyebrow_weight)
eyebrow_right_weight = np.full(CONSTANT_EYEBROW, eyebrow_weight)
nose_weight = np.full(CONSTANT_NOSE, nose_weight)
eye_left_weight = np.full(CONSTANT_EYE, eye_weight)
eye_right_weight = np.full(CONSTANT_EYE, eye_weight)
lips_weight = np.full(CONSTANT_LIPS, lip_weight)
eye_center_weight = np.full(CONSTANT_EYE_CENTER, eye_weight)
part_weight = np.concatenate(
(contour_weight, eyebrow_left_weight, eyebrow_right_weight,
nose_weight, eye_left_weight, eye_right_weight, lips_weight,
eye_center_weight),
axis=0)
self.part_weight = None
if part_weight is not None:
self.part_weight = torch.from_numpy(part_weight).cuda()
def forward(self, pred, target, pose):
weight = 5.0 * (1.0 - torch.cos(pose * np.pi / 180.0)) + 1.0
weight = torch.sum(weight, dim=1) / 3.0
weight = weight.view((weight.shape[0], 1))
if self.part_weight is not None:
weight = weight * self.part_weight
y = target
y_hat = pred
delta_y = (y - y_hat).abs() * weight
delta_y1 = delta_y[delta_y < self.omega]
delta_y2 = delta_y[delta_y >= self.omega]
loss1 = self.omega * torch.log(1 + delta_y1 / self.epsilon)
C = self.omega - self.omega * math.log(1 + self.omega / self.epsilon)
loss = delta_y2 - C
result = self.points_weight * (loss1.sum() + loss.sum()) / (
len(loss1) + len(loss))
return result
@LOSSES.register_module()
class FacePoseLoss(nn.Module):
def __init__(self, pose_weight=1.0):
super(FacePoseLoss, self).__init__()
self.criterion = nn.MSELoss()
self.pose_weight = pose_weight
def forward(self, pred, target):
result = self.pose_weight * self.criterion(pred, target)
return result
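In equation form, WingLossWithPose applies the standard wing loss to pose- and part-weighted residuals. With targets y, predictions \hat{y}, per-part weights w^{part} and pose angles a in degrees, the forward pass above computes

```latex
\[
w = \frac{1}{3}\sum_{a\in\{\mathrm{pitch},\,\mathrm{roll},\,\mathrm{yaw}\}}
    \Bigl(5\bigl(1-\cos\tfrac{\pi a}{180}\bigr)+1\Bigr), \qquad
d_i = w\,w^{\mathrm{part}}_i\,\lvert y_i-\hat{y}_i\rvert
\]
\[
\mathrm{wing}(d_i)=
\begin{cases}
\omega\,\ln\bigl(1+d_i/\epsilon\bigr), & d_i<\omega\\
d_i-C, & d_i\ge\omega
\end{cases},
\qquad C=\omega-\omega\,\ln\bigl(1+\omega/\epsilon\bigr)
\]
```

and returns points_weight times the mean of wing(d_i) over all keypoint coordinates.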

View File

@ -5,6 +5,10 @@ from .conv_ws import ConvWS2d, conv_ws_2d
from .dist_utils import (DistributedLossWrapper, DistributedMinerWrapper,
get_world_size, is_dist_avail_and_initialized,
reduce_mean)
from .face_keypoint_utils import (ION, InvertedResidual, Residual, Softmax,
View, conv_bn, conv_no_relu,
get_keypoint_accuracy, get_pose_accuracy,
pose_accuracy)
from .gather_layer import GatherLayer
from .init_weights import _init_weights, trunc_normal_
from .multi_pooling import GeMPooling, MultiAvgPooling, MultiPooling

View File

@ -0,0 +1,240 @@
import copy
import math
import numpy as np
import torch
import torch.nn as nn
def conv_bn(inp, oup, kernel, stride, padding=1):
return nn.Sequential(
nn.Conv2d(inp, oup, kernel, stride, padding, bias=False),
nn.BatchNorm2d(oup), nn.PReLU(oup))
def conv_no_relu(inp, oup, kernel, stride, padding=1):
return nn.Sequential(
nn.Conv2d(inp, oup, kernel, stride, padding, bias=False),
nn.BatchNorm2d(oup))
class View(nn.Module):
def __init__(self, shape):
super(View, self).__init__()
self.shape = shape
def forward(self, x):
return x.view(*self.shape)
class Softmax(nn.Module):
def __init__(self, dim):
super(Softmax, self).__init__()
self.softmax = nn.Softmax(dim)
def forward(self, x):
return self.softmax(x)
class InvertedResidual(nn.Module):
def __init__(self,
inp,
oup,
kernel_size,
stride,
padding,
expand_ratio=2,
use_connect=False,
activation='relu'):
super(InvertedResidual, self).__init__()
hid_channels = int(inp * expand_ratio)
if activation == 'relu':
self.conv = nn.Sequential(
nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
nn.BatchNorm2d(hid_channels), nn.ReLU(inplace=True),
nn.Conv2d(
hid_channels,
hid_channels,
kernel_size,
stride,
padding,
groups=hid_channels,
bias=False), nn.BatchNorm2d(hid_channels),
nn.ReLU(inplace=True),
nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup))
elif activation == 'prelu':
self.conv = nn.Sequential(
nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
nn.BatchNorm2d(hid_channels), nn.PReLU(hid_channels),
nn.Conv2d(
hid_channels,
hid_channels,
kernel_size,
stride,
padding,
groups=hid_channels,
bias=False), nn.BatchNorm2d(hid_channels),
nn.PReLU(hid_channels),
nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup))
elif activation == 'half_v1':
self.conv = nn.Sequential(
nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
nn.BatchNorm2d(hid_channels), nn.ReLU(inplace=True),
nn.Conv2d(
hid_channels,
hid_channels,
kernel_size,
stride,
padding,
groups=hid_channels,
bias=False), nn.BatchNorm2d(hid_channels),
nn.PReLU(hid_channels),
nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup))
elif activation == 'half_v2':
self.conv = nn.Sequential(
nn.Conv2d(inp, hid_channels, 1, 1, 0, bias=False),
nn.BatchNorm2d(hid_channels), nn.PReLU(hid_channels),
nn.Conv2d(
hid_channels,
hid_channels,
kernel_size,
stride,
padding,
groups=hid_channels,
bias=False), nn.BatchNorm2d(hid_channels),
nn.ReLU(inplace=True),
nn.Conv2d(hid_channels, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup))
self.use_connect = use_connect
def forward(self, x):
if self.use_connect:
return x + self.conv(x)
else:
return self.conv(x)
class Residual(nn.Module):
def __init__(self,
inp,
oup,
kernel_size,
stride,
padding,
use_connect=False,
activation='relu'):
super(Residual, self).__init__()
self.use_connect = use_connect
if activation == 'relu':
self.conv = nn.Sequential(
nn.Conv2d(
inp,
inp,
kernel_size,
stride,
padding,
groups=inp,
bias=False), nn.BatchNorm2d(inp), nn.ReLU(inplace=True),
nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
nn.ReLU(inplace=True))
elif activation == 'prelu':
self.conv = nn.Sequential(
nn.Conv2d(
inp,
inp,
kernel_size,
stride,
padding,
groups=inp,
bias=False), nn.BatchNorm2d(inp), nn.PReLU(inp),
nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
nn.PReLU(oup))
elif activation == 'half_v1':
self.conv = nn.Sequential(
nn.Conv2d(
inp,
inp,
kernel_size,
stride,
padding,
groups=inp,
bias=False), nn.BatchNorm2d(inp), nn.ReLU(inplace=True),
nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
nn.PReLU(oup))
elif activation == 'half_v2':
self.conv = nn.Sequential(
nn.Conv2d(
inp,
inp,
kernel_size,
stride,
padding,
groups=inp,
bias=False), nn.BatchNorm2d(inp), nn.PReLU(inp),
nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup),
nn.ReLU(inplace=True))
def forward(self, x):
if self.use_connect:
return x + self.conv(x)
else:
return self.conv(x)
def pose_accuracy(output, target):
with torch.no_grad():
output = output.detach().cpu().numpy()
target = target.detach().cpu().numpy()
acc = np.mean(np.abs(output - target))
return acc
def ION(output, target, left_eye_left_coner_idx, right_eye_right_corner_idx,
num_pts):
with torch.no_grad():
output = output.view(-1, num_pts, 2).cpu().numpy()
target = target.view(-1, num_pts, 2).cpu().numpy()
interocular = target[:,
left_eye_left_coner_idx] - target[:,
right_eye_right_corner_idx]
interocular = np.sqrt(
np.square(interocular[:, 0]) + np.square(interocular[:, 1])) + 1e-5
dist = target - output
dist = np.sqrt(np.square(dist[:, :, 0]) + np.square(dist[:, :, 1]))
dist = np.sum(dist, axis=1)
nme = dist / (interocular * num_pts)
return np.mean(nme)
def get_keypoint_accuracy(output, target_point):
accuracy = dict()
num_points = 106
left_eye_left_corner_index = 66
right_eye_right_corner_index = 79
nme = ION(output, target_point, left_eye_left_corner_index,
right_eye_right_corner_index, num_points)
accuracy['nme'] = nme
return accuracy
def get_pose_accuracy(output, target_pose):
accuracy = dict()
pose_acc = pose_accuracy(output, target_pose)
accuracy['pose_acc'] = float(pose_acc)
return accuracy
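In equation form, the ION/NME metric above is the mean point-to-point error normalized by the inter-ocular distance (landmark indices 66 and 79) and the number of points, averaged over the batch:

```latex
\[
\mathrm{NME} = \frac{1}{B}\sum_{b=1}^{B}
\frac{\sum_{i=1}^{N}\lVert g_{b,i}-p_{b,i}\rVert_2}
     {N\,\bigl(\lVert g_{b,66}-g_{b,79}\rVert_2+10^{-5}\bigr)}
\]
```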

View File

@ -2,6 +2,7 @@
from .classifier import TorchClassifier
from .detector import (TorchFaceDetector, TorchYoloXClassifierPredictor,
TorchYoloXPredictor)
from .face_keypoints_predictor import FaceKeypointsPredictor
from .feature_extractor import (TorchFaceAttrExtractor,
TorchFaceFeatureExtractor,
TorchFeatureExtractor)

View File

@ -113,6 +113,7 @@ class PredictorV2(object):
device=None,
save_results=False,
save_path=None,
mode='rgb',
*args,
**kwargs):
self.model_path = model_path
@ -135,6 +136,7 @@ class PredictorV2(object):
self.model = self.prepare_model()
self.processor = self.build_processor()
self._load_op = None
self.mode = mode
def prepare_model(self):
"""Build model from config file by default.
@ -182,7 +184,7 @@ class PredictorV2(object):
}
"""
if self._load_op is None:
load_cfg = dict(type='LoadImage', mode='rgb')
load_cfg = dict(type='LoadImage', mode=self.mode)
self._load_op = build_from_cfg(load_cfg, PIPELINES)
if not isinstance(input, str):

View File

@ -0,0 +1,120 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy
import os
import cv2
import numpy as np
import torch
from torchvision.transforms import Compose
from easycv.datasets.registry import PIPELINES
from easycv.models import build_model
from easycv.predictors.builder import PREDICTORS
from easycv.predictors.interface import PredictorInterface
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.registry import build_from_cfg
from ..models import *
from .base import PredictorV2
face_contour_point_index = [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
]
left_eye_brow_point_index = [33, 34, 35, 36, 37, 38, 39, 40, 41, 33]
right_eye_brow_point_index = [42, 43, 44, 45, 46, 47, 48, 49, 50, 42]
left_eye_point_index = [66, 67, 68, 69, 70, 71, 72, 73, 66]
right_eye_point_index = [75, 76, 77, 78, 79, 80, 81, 82, 75]
nose_bridge_point_index = [51, 52, 53, 54]
nose_contour_point_index = [55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]
mouth_outer_point_index = [84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 84]
mouth_inter_point_index = [96, 97, 98, 99, 100, 101, 102, 103, 96]
@PREDICTORS.register_module()
class FaceKeypointsPredictor(PredictorV2):
"""Predict pipeline for face keypoint
Args:
model_path (str): Path of model path
model_config (str): config file path for model and processor to init. Defaults to None.
"""
def __init__(self,
model_path,
model_config,
batch_size=1,
device=None,
save_results=False,
save_path=None,
mode='bgr'):
super(FaceKeypointsPredictor, self).__init__(
model_path,
model_config,
batch_size=batch_size,
device=device,
save_results=save_results,
save_path=save_path,
mode=mode)
self.input_size = self.cfg.IMAGE_SIZE
self.point_number = self.cfg.POINT_NUMBER
def show_result(self, img, points, scale=4.0, save_path=None):
"""Draw `result` over `img`.
Args:
img (str or Tensor): The image to be displayed.
result (Tensor): The face keypoints to draw over `img`.
scale: zoom in or out scale
save_path: path to save drawned 'img'
Returns:
img (Tensor): Only if not `show` or `out_file`
"""
img = cv2.imread(img)
img = img.copy()
h, w, c = img.shape
scale_h = h / self.input_size
scale_w = w / self.input_size
points = points.view(-1, self.point_number, 2).cpu().numpy()[0]
for index in range(len(points)):
points[index][0] *= scale_w
points[index][1] *= scale_h
image = cv2.resize(img, dsize=None, fx=scale, fy=scale)
def draw_line(point_index, image, point):
for i in range(len(point_index) - 1):
cur_index = point_index[i]
next_index = point_index[i + 1]
cur_pt = (int(point[cur_index][0] * scale),
int(point[cur_index][1] * scale))
next_pt = (int(point[next_index][0] * scale),
int(point[next_index][1] * scale))
cv2.line(image, cur_pt, next_pt, (0, 0, 255), thickness=2)
draw_line(face_contour_point_index, image, points)
draw_line(left_eye_brow_point_index, image, points)
draw_line(right_eye_brow_point_index, image, points)
draw_line(left_eye_point_index, image, points)
draw_line(right_eye_point_index, image, points)
draw_line(nose_bridge_point_index, image, points)
draw_line(nose_contour_point_index, image, points)
draw_line(mouth_outer_point_index, image, points)
draw_line(mouth_inter_point_index, image, points)
size = len(points)
for i in range(size):
x = int(points[i][0])
y = int(points[i][1])
cv2.putText(image, str(i), (int(x * scale), int(y * scale)),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
cv2.circle(image, (int(x * scale), int(y * scale)), 2, (0, 255, 0),
cv2.FILLED)
if save_path is not None:
cv2.imwrite(save_path, image)
return image

View File

@ -3,6 +3,7 @@ dataclasses
einops
future
h5py
imgaug
json_tricks
numpy
opencv-python

View File

@ -0,0 +1,39 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy
import os
import tempfile
import unittest
import cv2
import numpy as np
from PIL import Image
from easycv.predictors.face_keypoints_predictor import FaceKeypointsPredictor
class FaceKeypointsPredictorWithoutDetectorTest(unittest.TestCase):
def setUp(self):
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
self.image_path = './data/test/face_2d_keypoints/data/002253.png'
self.save_image_path = './data/test/face_2d_keypoints/data/result_002253.png'
self.model_path = './data/test/face_2d_keypoints/models/epoch_580.pth'
self.model_config_path = './configs/face/face_96x96_wingloss.py'
def test_single(self):
predict_pipeline = FaceKeypointsPredictor(
model_path=self.model_path, model_config=self.model_config_path)
output = predict_pipeline(self.image_path)[0]
output_keypoints = output['point']
output_pose = output['pose']
image_show = predict_pipeline.show_result(
self.image_path,
output_keypoints,
scale=2,
save_path=self.save_image_path)
if __name__ == '__main__':
unittest.main()