diff --git a/mmpretrain/datasets/scienceqa.py b/mmpretrain/datasets/scienceqa.py
index 391f7e1ac..968264486 100644
--- a/mmpretrain/datasets/scienceqa.py
+++ b/mmpretrain/datasets/scienceqa.py
@@ -62,42 +62,45 @@ class ScienceQA(BaseDataset):
         data_list = []
         for data_id in current_data_split:
             ann = annotations[data_id]
-            if ann['image'] is not None:
-                data_info = {
-                    'image_id':
-                    data_id,
-                    'question':
-                    ann['question'],
-                    'choices':
-                    ann['choices'],
-                    'answer':
-                    ann['answer'],
-                    'hint':
-                    ann['hint'],
-                    'image_name':
-                    ann['image'],
-                    'task':
-                    ann['task'],
-                    'grade':
-                    ann['grade'],
-                    'subject':
-                    ann['subject'],
-                    'topic':
-                    ann['topic'],
-                    'category':
-                    ann['category'],
-                    'skill':
-                    ann['skill'],
-                    'lecture':
-                    ann['lecture'],
-                    'solution':
-                    ann['solution'],
-                    'split':
-                    ann['split'],
-                    'img_path':
-                    file_backend.join_path(img_prefix, data_id,
-                                           ann['image']),  # noqa
-                }
-                data_list.append(data_info)
+            data_info = {
+                'image_id':
+                data_id,
+                'question':
+                ann['question'],
+                'choices':
+                ann['choices'],
+                'gt_answer':
+                ann['answer'],
+                'hint':
+                ann['hint'],
+                'image_name':
+                ann['image'],
+                'task':
+                ann['task'],
+                'grade':
+                ann['grade'],
+                'subject':
+                ann['subject'],
+                'topic':
+                ann['topic'],
+                'category':
+                ann['category'],
+                'skill':
+                ann['skill'],
+                'lecture':
+                ann['lecture'],
+                'solution':
+                ann['solution'],
+                'split':
+                ann['split'],
+                'img_path':
+                file_backend.join_path(img_prefix, data_id, ann['image'])
+                if ann['image'] is not None else None,
+                'caption':
+                ann['caption'],
+                'has_image':
+                True if ann['image'] is not None else False,
+            }
+            data_list.append(data_info)
 
         return data_list
diff --git a/mmpretrain/evaluation/metrics/__init__.py b/mmpretrain/evaluation/metrics/__init__.py
index 683cf72be..186cdd9f2 100644
--- a/mmpretrain/evaluation/metrics/__init__.py
+++ b/mmpretrain/evaluation/metrics/__init__.py
@@ -3,6 +3,7 @@ from .caption import COCOCaption
 from .multi_label import AveragePrecision, MultiLabelMetric
 from .multi_task import MultiTasksMetric
 from .retrieval import RetrievalRecall
+from .scienceqa import ScienceQAMetric
 from .single_label import Accuracy, ConfusionMatrix, SingleLabelMetric
 from .visual_grounding_eval import VisualGroundingMetric
 from .voc_multi_label import VOCAveragePrecision, VOCMultiLabelMetric
@@ -12,5 +13,5 @@ __all__ = [
     'Accuracy', 'SingleLabelMetric', 'MultiLabelMetric', 'AveragePrecision',
     'MultiTasksMetric', 'VOCAveragePrecision', 'VOCMultiLabelMetric',
     'ConfusionMatrix', 'RetrievalRecall', 'VQAAcc', 'ReportVQA', 'COCOCaption',
-    'VisualGroundingMetric'
+    'VisualGroundingMetric', 'ScienceQAMetric'
 ]
diff --git a/mmpretrain/evaluation/metrics/scienceqa.py b/mmpretrain/evaluation/metrics/scienceqa.py
new file mode 100644
index 000000000..90a392042
--- /dev/null
+++ b/mmpretrain/evaluation/metrics/scienceqa.py
@@ -0,0 +1,170 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+from typing import List, Optional
+
+from mmengine.evaluator import BaseMetric
+
+from mmpretrain.registry import METRICS
+
+
+def get_pred_idx(prediction: str, choices: List[str],
+                 options: List[str]) -> int:  # noqa
+    """Get the index (e.g. 2) from the prediction (e.g. 'C')
+
+    Args:
+        prediction (str): The prediction from the model,
+            from ['A', 'B', 'C', 'D', 'E']
+        choices (List(str)): The choices for the question,
+            from ['A', 'B', 'C', 'D', 'E']
+        options (List(str)): The options for the question,
+            from ['A', 'B', 'C', 'D', 'E']
+
+    Returns:
+        int: The index of the prediction, from [0, 1, 2, 3, 4]
+    """
+    if prediction in options[:len(choices)]:
+        return options.index(prediction)
+    else:
+        return random.choice(range(len(choices)))
+
+
+@METRICS.register_module()
+class ScienceQAMetric(BaseMetric):
+    """Evaluation Metric for ScienceQA.
+
+    Args:
+        options (List(str)): Options for each question. Defaults to
+            ["A", "B", "C", "D", "E"].
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Should be modified according to the
+            `retrieval_type` for unambiguous results. Defaults to TR.
+    """
+
+    def __init__(self,
+                 options: List[str] = ['A', 'B', 'C', 'D', 'E'],
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.options = options
+
+    def process(self, data_batch, data_samples) -> None:
+        """Process one batch of data samples.
+
+        data_samples should contain the following keys:
+        1. pred_answer (str): The prediction from the model,
+            from ['A', 'B', 'C', 'D', 'E']
+        2. choices (List(str)): The choices for the question,
+            from ['A', 'B', 'C', 'D', 'E']
+        3. grade (int): The grade for the question, from grade1 to grade12
+        4. subject (str): The subject for the question, from
+            ['natural science', 'social science', 'language science']
+        5. answer (str): The answer for the question, from
+            ['A', 'B', 'C', 'D', 'E']
+        6. hint (str): The hint for the question
+        7. has_image (bool): Whether or not the question has image
+
+
+        The processed results should be stored in ``self.results``, which will
+        be used to computed the metrics when all batches have been processed.
+
+        Args:
+            data_batch: A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        for data_sample in data_samples:
+            result = dict()
+            choices = data_sample.get('choices')
+            result['prediction'] = get_pred_idx(
+                data_sample.get('pred_answer'), choices, self.options)
+            result['grade'] = data_sample.get('grade')
+            result['subject'] = data_sample.get('subject')
+            result['answer'] = self.options[data_sample.get('gt_answer')]
+            hint = data_sample.get('hint')
+            has_image = data_sample.get('has_image', False)
+            result[
+                'no_context'] = True if not has_image and hint is None else False  # noqa
+            result['has_text'] = True if hint is not None else False
+            result['has_image'] = has_image
+
+            # Save the result to `self.results`.
+            self.results.append(result)
+
+    def compute_metrics(self, results: List) -> dict:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (dict): The processed results of each batch.
+
+        Returns:
+            Dict: The computed metrics. The keys are the names of the metrics,
+            and the values are corresponding results.
+        """
+        # NOTICE: don't access `self.results` from the method.
+        metrics = dict()
+
+        all_acc = []
+        acc_natural = []
+        acc_social = []
+        acc_language = []
+        acc_has_text = []
+        acc_has_image = []
+        acc_no_context = []
+        acc_grade_1_6 = []
+        acc_grade_7_12 = []
+
+        for result in results:
+            correct = result['prediction'] == result['answer']
+            all_acc.append(correct)
+            # different subjects
+            if result['subject'] == 'natural science':
+                acc_natural.append(correct)
+            elif result['subject'] == 'social science':
+                acc_social.append(correct)
+            elif result['subject'] == 'language science':
+                acc_language.append(correct)
+
+            # different context
+            if result['has_text']:
+                acc_has_text.append(correct)
+            elif result['has_image']:
+                acc_has_image.append(correct)
+            elif result['no_context']:
+                acc_no_context.append(correct)
+
+            # different grade
+            if result['grade'] in [
+                    'grade1', 'grade2', 'grade3', 'grade4', 'grade5', 'grade6'
+            ]:
+                acc_grade_1_6.append(correct)
+            elif result['grade'] in [
+                    'grade7', 'grade8', 'grade9', 'grade10', 'grade11',
+                    'grade12'
+            ]:
+                acc_grade_7_12.append(correct)
+
+        metrics['all_acc'] = sum(all_acc) / len(all_acc)
+        if len(acc_natural) > 0:
+            metrics['acc_natural'] = sum(acc_natural) / len(acc_natural)
+        if len(acc_social) > 0:
+            metrics['acc_social'] = sum(acc_social) / len(acc_social)
+        if len(acc_language) > 0:
+            metrics['acc_language'] = sum(acc_language) / len(acc_language)
+        if len(acc_has_text) > 0:
+            metrics['acc_has_text'] = sum(acc_has_text) / len(acc_has_text)
+        if len(acc_has_image) > 0:
+            metrics['acc_has_image'] = sum(acc_has_image) / len(acc_has_image)
+        if len(acc_no_context) > 0:
+            metrics['acc_no_context'] = sum(acc_no_context) / len(
+                acc_no_context)
+        if len(acc_grade_1_6) > 0:
+            metrics['acc_grade_1_6'] = sum(acc_grade_1_6) / len(acc_grade_1_6)
+        if len(acc_grade_7_12) > 0:
+            metrics['acc_grade_7_12'] = sum(acc_grade_7_12) / len(
+                acc_grade_7_12)
+
+        return metrics
diff --git a/tests/test_evaluation/test_metrics/test_scienceqa.py b/tests/test_evaluation/test_metrics/test_scienceqa.py
new file mode 100644
index 000000000..5df50aa30
--- /dev/null
+++ b/tests/test_evaluation/test_metrics/test_scienceqa.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.evaluator import Evaluator
+
+from mmpretrain.structures import DataSample
+
+
+class TestScienceQAMetric:
+
+    def test_evaluate(self):
+        meta_info = {
+            'choices': ['A', 'B', 'C', 'D'],
+            'pred_answer': 'A',
+            'grade': 'grade1',
+            'subject': 'language science',
+            'gt_answer': 1,
+            'hint': 'hint',
+            'has_image': True
+        }
+        data_sample = DataSample(metainfo=meta_info)
+        data_samples = [data_sample for _ in range(10)]
+        evaluator = Evaluator(dict(type='mmpretrain.ScienceQAMetric'))
+        evaluator.process(data_samples)
+        res = evaluator.evaluate(4)
+        assert res['acc_grade_1_6'] == 0.0
+        assert res['acc_language'] == 0.0
+        assert res['all_acc'] == 0.0