PaddleOCR/ppocr/utils/network.py

# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import time
import shutil
import tarfile
import requests
import os.path as osp
import paddle.distributed as dist
from tqdm import tqdm

from ppocr.utils.logging import get_logger

MODELS_DIR = os.path.expanduser("~/.paddleocr/models/")
DOWNLOAD_RETRY_LIMIT = 3


def download_with_progressbar(url, save_path):
    """
    Make sure ``save_path`` exists, downloading it from ``url`` if needed.

    url (str): download url
    save_path (str): download to given path

    If ``save_path`` already exists nothing is downloaded. Otherwise only the
    process whose ``dist.get_rank()`` is 0 performs the download; every other
    process polls once per second until ``save_path`` shows up on disk.
    """
    logger = get_logger()

    already_present = save_path and os.path.exists(save_path)
    if already_present:
        logger.info(f"Path {save_path} already exists. Skipping...")
        return

    # Let a single process fetch the file so that concurrent workers do not
    # download (and overwrite) the same target at the same time.
    is_downloader = dist.get_rank() == 0
    if is_downloader:
        _download(url, save_path)
        return

    # NOTE(review): this wait has no timeout; it only ends once the file
    # appears at save_path.
    while not os.path.exists(save_path):
        time.sleep(1)


def _download(url, save_path):
    """
    Download from url, save to path.

    url (str): download url
    save_path (str): download to given path

    Returns:
        str: ``save_path``. If ``save_path`` already exists, no request is
        made and the path is returned immediately.

    Raises:
        RuntimeError: if the connection could not be established within
            ``DOWNLOAD_RETRY_LIMIT`` attempts, or if the server answers with
            a status code other than 200.

    Only failures of the initial ``requests.get`` call are retried; errors
    raised while streaming the body propagate to the caller.
    """
    logger = get_logger()

    fname = osp.split(url)[-1]
    retry_cnt = 0

    while not osp.exists(save_path):
        if retry_cnt >= DOWNLOAD_RETRY_LIMIT:
            raise RuntimeError(
                "Download from {} failed. " "Retry limit reached".format(url)
            )
        retry_cnt += 1

        try:
            req = requests.get(url, stream=True)
        except Exception as e:  # requests.exceptions.ConnectionError
            # retry_cnt was already incremented for this attempt, so it is
            # exactly the number of failures seen so far.
            logger.info(
                "Downloading {} from {} failed {} times with exception {}".format(
                    fname, url, retry_cnt, str(e)
                )
            )
            time.sleep(1)
            continue

        # With stream=True the connection stays open until the body is
        # consumed or the response is closed, so always close it explicitly,
        # including on the error paths below.
        try:
            if req.status_code != 200:
                raise RuntimeError(
                    "Downloading from {} failed with code "
                    "{}!".format(url, req.status_code)
                )

            # To protect against an interrupted download, write to a
            # temporary file first and move it to save_path only after the
            # whole body has been written.
            tmp_file = save_path + ".tmp"
            total_size = req.headers.get("content-length")
            with open(tmp_file, "wb") as f:
                if total_size:
                    # One progress tick per 1 KiB chunk requested.
                    with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
                        for chunk in req.iter_content(chunk_size=1024):
                            f.write(chunk)
                            pbar.update(1)
                else:
                    for chunk in req.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
        finally:
            req.close()

        shutil.move(tmp_file, save_path)

    return save_path


def maybe_download(model_storage_directory, url):
    """
    Ensure the inference model files exist in ``model_storage_directory``.

    model_storage_directory (str): directory that should contain
        ``inference.pdiparams`` and ``inference.pdmodel``
    url (str): url of a ``.tar`` archive holding the model files

    If both files already exist nothing happens. Otherwise the archive is
    downloaded into ``model_storage_directory``, every archive member whose
    name ends with ``.pdiparams``, ``.pdiparams.info`` or ``.pdmodel`` is
    written out as ``inference<suffix>``, and the archive is removed.

    Raises:
        AssertionError: if a download is needed and ``url`` does not end
            with ``.tar``.
    """
    # using custom model
    tar_file_name_list = [".pdiparams", ".pdiparams.info", ".pdmodel"]
    params_path = os.path.join(model_storage_directory, "inference.pdiparams")
    model_path = os.path.join(model_storage_directory, "inference.pdmodel")
    if os.path.exists(params_path) and os.path.exists(model_path):
        return

    assert url.endswith(".tar"), "Only supports tar compressed package"
    tmp_path = os.path.join(model_storage_directory, url.split("/")[-1])
    print("download {} to {}".format(url, tmp_path))
    os.makedirs(model_storage_directory, exist_ok=True)
    download_with_progressbar(url, tmp_path)
    with tarfile.open(tmp_path, "r") as tarObj:
        for member in tarObj.getmembers():
            filename = None
            for tar_file_name in tar_file_name_list:
                if member.name.endswith(tar_file_name):
                    filename = "inference" + tar_file_name
            if filename is None:
                continue
            src = tarObj.extractfile(member)
            if src is None:
                # extractfile() yields None for members that are not regular
                # files or links (e.g. a directory with a matching name).
                continue
            target = os.path.join(model_storage_directory, filename)
            # Stream the member to disk instead of loading it fully into
            # memory, and close both file objects when done.
            with src, open(target, "wb") as dst:
                shutil.copyfileobj(src, dst)
    os.remove(tmp_path)


def maybe_download_params(model_path):
    """
    Resolve ``model_path`` to a path, downloading it when it is a link.

    model_path (str): an existing path, or a value accepted by ``is_link``

    Returns ``model_path`` unchanged when it exists on disk or is not a link.
    Otherwise the link is downloaded into ``MODELS_DIR`` under its last
    "/"-separated component and that destination path is returned.
    """
    usable_as_is = os.path.exists(model_path) or not is_link(model_path)
    if usable_as_is:
        return model_path

    remote = model_path
    base_name = remote.split("/")[-1]
    destination = os.path.join(MODELS_DIR, base_name)
    print("download {} to {}".format(remote, destination))
    os.makedirs(MODELS_DIR, exist_ok=True)
    download_with_progressbar(remote, destination)
    return destination


def is_link(s):
    """Return True when ``s`` is not None and starts with "http"."""
    if s is None:
        return False
    return s.startswith("http")


def confirm_model_dir_url(model_dir, default_model_dir, default_url):
    """
    Work out which model directory and download url to use.

    model_dir (str | None): user supplied directory, a link, or None
    default_model_dir (str): base directory used when ``model_dir`` is None
        or a link
    default_url (str): url used unless ``model_dir`` itself is a link

    Returns a ``(model_dir, url)`` tuple. When ``model_dir`` is a plain
    (non-link, non-None) value it is returned unchanged with ``default_url``.
    """
    needs_default_dir = model_dir is None or is_link(model_dir)
    if not needs_default_dir:
        return model_dir, default_url

    url = model_dir if is_link(model_dir) else default_url
    archive_name = url.split("/")[-1]
    # Drop the last four characters of the archive name
    # (presumably the ".tar" suffix; verify against callers).
    stem = archive_name[:-4]
    return os.path.join(default_model_dir, stem), url
merge dygraph 2021-06-10 14:24:59 +08:00			`# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import os`
			`import sys`
fix download bug when use multi gpus (#13610) 2024-08-06 21:15:52 +08:00			`import time`
			`import shutil`
merge dygraph 2021-06-10 14:24:59 +08:00			`import tarfile`
			`import requests`
fix download bug when use multi gpus (#13610) 2024-08-06 21:15:52 +08:00			`import os.path as osp`
			`import paddle.distributed as dist`
merge dygraph 2021-06-10 14:24:59 +08:00			`from tqdm import tqdm`

			`from ppocr.utils.logging import get_logger`

add d2s train for slanet and v3 (#9341) * add d2s train for slanet and v3 * fix bug 2023-03-09 11:21:34 +08:00			`MODELS_DIR = os.path.expanduser("~/.paddleocr/models/")`
fix download bug when use multi gpus (#13610) 2024-08-06 21:15:52 +08:00			`DOWNLOAD_RETRY_LIMIT = 3`
add d2s train for slanet and v3 (#9341) * add d2s train for slanet and v3 * fix bug 2023-03-09 11:21:34 +08:00
merge dygraph 2021-06-10 14:24:59 +08:00
			`def download_with_progressbar(url, save_path):`
			`logger = get_logger()`
fix the issue of repeatedly downloading pretrained model (#12142) * fix the issue of repeatedly downloading pretrained model * add log info 2024-05-20 19:22:45 +08:00			`if save_path and os.path.exists(save_path):`
			`logger.info(f"Path {save_path} already exists. Skipping...")`
			`return`
add desc of version params (#3929) * add desc of version params 2021-11-10 20:20:45 +08:00			`else:`
fix download bug when use multi gpus (#13610) 2024-08-06 21:15:52 +08:00			`# Mainly used to solve the problem of downloading data from different`
			`# machines in the case of multiple machines. Different nodes will download`
			`# data, and the same node will only download data once.`
			`if dist.get_rank() == 0:`
			`_download(url, save_path)`
			`else:`
			`while not os.path.exists(save_path):`
			`time.sleep(1)`


			`def _download(url, save_path):`
			`"""`
			`Download from url, save to path.`

			`url (str): download url`
			`save_path (str): download to given path`
			`"""`
			`logger = get_logger()`

			`fname = osp.split(url)[-1]`
			`retry_cnt = 0`

			`while not osp.exists(save_path):`
			`if retry_cnt < DOWNLOAD_RETRY_LIMIT:`
			`retry_cnt += 1`
			`else:`
			`raise RuntimeError(`
			`"Download from {} failed. " "Retry limit reached".format(url)`
			`)`

			`try:`
			`req = requests.get(url, stream=True)`
			`except Exception as e: # requests.exceptions.ConnectionError`
			`logger.info(`
			`"Downloading {} from {} failed {} times with exception {}".format(`
			`fname, url, retry_cnt + 1, str(e)`
			`)`
			`)`
			`time.sleep(1)`
			`continue`

			`if req.status_code != 200:`
			`raise RuntimeError(`
			`"Downloading from {} failed with code "`
			`"{}!".format(url, req.status_code)`
			`)`

			`# For protecting download interupted, download to`
			`# tmp_file firstly, move tmp_file to save_path`
			`# after download finished`
			`tmp_file = save_path + ".tmp"`
			`total_size = req.headers.get("content-length")`
			`with open(tmp_file, "wb") as f:`
			`if total_size:`
			`with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:`
			`for chunk in req.iter_content(chunk_size=1024):`
			`f.write(chunk)`
			`pbar.update(1)`
			`else:`
			`for chunk in req.iter_content(chunk_size=1024):`
			`if chunk:`
			`f.write(chunk)`
			`shutil.move(tmp_file, save_path)`

			`return save_path`
merge dygraph 2021-06-10 14:24:59 +08:00

			`def maybe_download(model_storage_directory, url):`
			`# using custom model`
add pre-commit workflow (#11973) * add pre-commit workflow * run 'pre-commit run --all-files' * setup python version 2024-04-21 21:46:20 +08:00			`tar_file_name_list = [".pdiparams", ".pdiparams.info", ".pdmodel"]`
merge dygraph 2021-06-10 14:24:59 +08:00			`if not os.path.exists(`
add pre-commit workflow (#11973) * add pre-commit workflow * run 'pre-commit run --all-files' * setup python version 2024-04-21 21:46:20 +08:00			`os.path.join(model_storage_directory, "inference.pdiparams")`
			`) or not os.path.exists(os.path.join(model_storage_directory, "inference.pdmodel")):`
			`assert url.endswith(".tar"), "Only supports tar compressed package"`
			`tmp_path = os.path.join(model_storage_directory, url.split("/")[-1])`
			`print("download {} to {}".format(url, tmp_path))`
merge dygraph 2021-06-10 14:24:59 +08:00			`os.makedirs(model_storage_directory, exist_ok=True)`
			`download_with_progressbar(url, tmp_path)`
add pre-commit workflow (#11973) * add pre-commit workflow * run 'pre-commit run --all-files' * setup python version 2024-04-21 21:46:20 +08:00			`with tarfile.open(tmp_path, "r") as tarObj:`
merge dygraph 2021-06-10 14:24:59 +08:00			`for member in tarObj.getmembers():`
			`filename = None`
			`for tar_file_name in tar_file_name_list:`
add layout model 2022-08-17 04:40:07 +00:00			`if member.name.endswith(tar_file_name):`
add pre-commit workflow (#11973) * add pre-commit workflow * run 'pre-commit run --all-files' * setup python version 2024-04-21 21:46:20 +08:00			`filename = "inference" + tar_file_name`
merge dygraph 2021-06-10 14:24:59 +08:00			`if filename is None:`
			`continue`
			`file = tarObj.extractfile(member)`
add pre-commit workflow (#11973) * add pre-commit workflow * run 'pre-commit run --all-files' * setup python version 2024-04-21 21:46:20 +08:00			`with open(os.path.join(model_storage_directory, filename), "wb") as f:`
merge dygraph 2021-06-10 14:24:59 +08:00			`f.write(file.read())`
			`os.remove(tmp_path)`


support auto download model from bos (#9349) 2023-03-08 19:21:28 +08:00			`def maybe_download_params(model_path):`
update tipc to_static (#9369) * add d2s train for slanet and v3 * fix bug * udpate tipc to_static * update db * remove_print * update benchmark_train.sh * update maybe_download_params 2023-03-10 19:07:37 +08:00			`if os.path.exists(model_path) or not is_link(model_path):`
support auto download model from bos (#9349) 2023-03-08 19:21:28 +08:00			`return model_path`
			`else:`
			`url = model_path`
add pre-commit workflow (#11973) * add pre-commit workflow * run 'pre-commit run --all-files' * setup python version 2024-04-21 21:46:20 +08:00			`tmp_path = os.path.join(MODELS_DIR, url.split("/")[-1])`
			`print("download {} to {}".format(url, tmp_path))`
support auto download model from bos (#9349) 2023-03-08 19:21:28 +08:00			`os.makedirs(MODELS_DIR, exist_ok=True)`
			`download_with_progressbar(url, tmp_path)`
			`return tmp_path`


merge dygraph 2021-06-10 14:24:59 +08:00			`def is_link(s):`
add pre-commit workflow (#11973) * add pre-commit workflow * run 'pre-commit run --all-files' * setup python version 2024-04-21 21:46:20 +08:00			`return s is not None and s.startswith("http")`
merge dygraph 2021-06-10 14:24:59 +08:00

			`def confirm_model_dir_url(model_dir, default_model_dir, default_url):`
			`url = default_url`
			`if model_dir is None or is_link(model_dir):`
			`if is_link(model_dir):`
			`url = model_dir`
add pre-commit workflow (#11973) * add pre-commit workflow * run 'pre-commit run --all-files' * setup python version 2024-04-21 21:46:20 +08:00			`file_name = url.split("/")[-1][:-4]`
merge dygraph 2021-06-10 14:24:59 +08:00			`model_dir = default_model_dir`
			`model_dir = os.path.join(model_dir, file_name)`
			`return model_dir, url`