diff --git a/README.md b/README.md
index bdbb7ec7..b388e353 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 # OpenSelfSup
 
 **News**
+* OpenSelfSup now supports [Mixed Precision Training (apex AMP)](https://github.com/NVIDIA/apex)!
 * A bug of MoCo v2 has been fixed and now the results are reproducible.
 * OpenSelfSup now supports [BYOL](https://arxiv.org/pdf/2006.07733.pdf)!
 
diff --git a/openselfsup/apis/train.py b/openselfsup/apis/train.py
index f2088f6b..85c00ad0 100644
--- a/openselfsup/apis/train.py
+++ b/openselfsup/apis/train.py
@@ -184,13 +184,13 @@ def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
             drop_last=getattr(cfg.data, 'drop_last', False)) for ds in dataset
     ]
     optimizer = build_optimizer(model, cfg.optimizer)
-    if 'use_fp16' in cfg and cfg.use_fp16 == True:
+    if 'use_fp16' in cfg and cfg.use_fp16:
         model, optimizer = apex.amp.initialize(model.cuda(), optimizer, opt_level="O1")
         print_log('**** Initializing mixed precision done. ****')
     # put model on gpus
     model = MMDistributedDataParallel(
-        model.cuda(),
+        model if next(model.parameters()).is_cuda else model.cuda(),
         device_ids=[torch.cuda.current_device()],
         broadcast_buffers=False)
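
For context, the diff above only shows the initialization side of the apex AMP integration. Below is a minimal sketch of the full pattern this change enables: AMP initialization, DDP wrapping that skips the redundant `.cuda()` call, and loss scaling in the backward pass. The helper names (`init_amp_and_ddp`, `train_step`) and the `'loss'` output key are illustrative assumptions, not part of OpenSelfSup's actual API.

```python
# Sketch only: assumes apex, torch, and mmcv are installed and that the model
# returns a dict with a 'loss' entry (a hypothetical convention for this example).
import apex
import torch
from mmcv.parallel import MMDistributedDataParallel


def init_amp_and_ddp(model, optimizer, use_fp16=True):
    if use_fp16:
        # apex AMP patches model and optimizer for mixed precision;
        # "O1" keeps master weights in fp32 and casts ops to fp16 where safe.
        model, optimizer = apex.amp.initialize(
            model.cuda(), optimizer, opt_level="O1")
    # Only call .cuda() if AMP has not already moved the model to the GPU.
    model = MMDistributedDataParallel(
        model if next(model.parameters()).is_cuda else model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)
    return model, optimizer


def train_step(model, optimizer, data, use_fp16=True):
    loss = model(**data)['loss']  # hypothetical output key
    optimizer.zero_grad()
    if use_fp16:
        # Scale the loss so fp16 gradients do not underflow before backward().
        with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()
```

The conditional `model if next(model.parameters()).is_cuda else model.cuda()` matters because `apex.amp.initialize` already receives a CUDA model; wrapping it in DDP should not move or re-cast it a second time.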