diff --git a/ppcls/arch/__init__.py b/ppcls/arch/__init__.py
index 0c45cf6fc..2d5e29db8 100644
--- a/ppcls/arch/__init__.py
+++ b/ppcls/arch/__init__.py
@@ -28,7 +28,6 @@ from ppcls.utils import logger
 from ppcls.utils.save_load import load_dygraph_pretrain
 from ppcls.arch.slim import prune_model, quantize_model
 
-
 __all__ = ["build_model", "RecModel", "DistillationModel"]
 
 
@@ -82,13 +81,11 @@ class RecModel(TheseusLayer):
         out["backbone"] = x
         if self.neck is not None:
             x = self.neck(x)
+            out["neck"] = x
         out["features"] = x
         if self.head is not None:
             y = self.head(x, label)
-            out["neck"] = x
-        else:
-            y = None
-        out["logits"] = y
+            out["logits"] = y
         return out
 
 
diff --git a/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml b/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml
index c8973b064..b6c45363b 100644
--- a/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml
+++ b/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml
@@ -1,5 +1,4 @@
 # global configs
-# global configs
 Global:
   checkpoints: null
   pretrained_model: null
@@ -85,11 +84,6 @@ Loss:
         key: "logits"
         model_name_pairs:
         - ["Student", "Teacher"]
-    - DistillationDMLLoss:
-        weight: 1.0
-        key: "logits"
-        model_name_pairs:
-        - ["Student", "Teacher"]
   Eval:
     - DistillationGTCELoss:
         weight: 1.0
diff --git a/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml b/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml
index e7147694c..d67704e09 100644
--- a/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml
+++ b/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml
@@ -57,7 +57,7 @@ Optimizer:
   momentum: 0.9
   lr:
     name: Cosine
-    learning_rate: 1.3
+    learning_rate: 0.65
     warmup_epoch: 5
   regularizer:
     name: 'L2'
diff --git a/ppcls/loss/distillationloss.py b/ppcls/loss/distillationloss.py
index ab6187f5a..0340234b9 100644
--- a/ppcls/loss/distillationloss.py
+++ b/ppcls/loss/distillationloss.py
@@ -69,7 +69,7 @@ class DistillationGTCELoss(CELoss):
 
     def forward(self, predicts, batch):
         loss_dict = dict()
-        for _, name in enumerate(self.model_names):
+        for name in self.model_names:
             out = predicts[name]
             if self.key is not None:
                 out = out[self.key]
diff --git a/ppcls/loss/dmlloss.py b/ppcls/loss/dmlloss.py
index 16ea76467..48bf6c024 100644
--- a/ppcls/loss/dmlloss.py
+++ b/ppcls/loss/dmlloss.py
@@ -42,8 +42,8 @@ class DMLLoss(nn.Layer):
 
     def forward(self, x, target):
         if self.act is not None:
-            x = F.softmax(x)
-            target = F.softmax(target)
+            x = self.act(x)
+            target = self.act(target)
         loss = self._kldiv(x, target) + self._kldiv(target, x)
         loss = loss / 2
         loss = paddle.mean(loss)