forked from microsoft/computervision-recipes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
144 lines (116 loc) · 3.9 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
from pathlib import Path
import requests
from typing import List, Union
from urllib.parse import urlparse
from zipfile import ZipFile
def data_path() -> Path:
    """Return the ``data`` directory path, creating the directory if needed.

    The directory is the ``data`` folder located two levels above the folder
    that contains this module; the path is resolved with ``os.path.realpath``.

    Returns:
        Path of the data directory.
    """
    module_dir = os.path.dirname(__file__)
    candidate = os.path.join(module_dir, os.pardir, os.pardir, "data")
    resolved = Path(os.path.realpath(candidate))
    # Only the leaf directory is created here; its parent must already exist.
    resolved.mkdir(exist_ok=True)
    return resolved
def download(url: str, loc: str):
    """Download the contents of a url into the file 'loc'.

    Args:
        url: address fetched with an HTTP GET request.
        loc: path of the local file to write (opened in binary write mode,
            so an existing file is overwritten).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.

    Return:
        The path 'loc' that was written to.
    """
    r = requests.get(url)
    # Fail loudly instead of saving an HTTP error page as if it were the
    # requested file.
    r.raise_for_status()
    with open(loc, "wb") as f:
        f.write(r.content)
    return loc
def get_files_in_directory(
    directory: str, suffixes: List[str] = None
) -> List[str]:
    """Returns all filenames in a directory which optionally match one of multiple suffixes.

    Only regular files directly inside `directory` are returned;
    sub-directories are skipped and not descended into.

    Args:
        directory: directory to scan for files.
        suffixes: only keep the filenames which end with one of the suffixes
            (e.g. suffixes = [".jpg", ".png", ".gif"]). Matching is
            case-insensitive. A single string is treated as one suffix.
            None or an empty value disables filtering.

    Raises:
        Exception: if `directory` does not exist.

    Return:
        Sorted list of filenames, each prefixed with `directory`.
    """
    if not os.path.exists(directory):
        raise Exception(f"Directory '{directory}' does not exist.")
    filenames = [str(p) for p in Path(directory).iterdir() if p.is_file()]
    if suffixes:
        # A bare string would otherwise be split into single characters by
        # tuple(), matching almost every file.
        if isinstance(suffixes, str):
            suffixes = [suffixes]
        # File names are lower-cased before comparison, so the suffixes must
        # be lower-cased too or upper-case suffixes could never match.
        wanted = tuple(suffix.lower() for suffix in suffixes)
        filenames = [f for f in filenames if f.lower().endswith(wanted)]
    return sorted(filenames)
def _get_file_name(url: str) -> str:
    """Return the last '/'-separated segment of the url's path component."""
    url_path = urlparse(url).path
    # rpartition yields the whole string as the tail when no '/' is present.
    _, _, last_segment = url_path.rpartition("/")
    return last_segment
def unzip_url(
    url: str,
    fpath: Union[Path, str] = None,
    dest: Union[Path, str] = None,
    exist_ok: bool = False,
) -> str:
    """Download file from URL to {fpath} and unzip to {dest}.

    {fpath} and {dest} must be directories; both are created if missing.
    If only one of them is given, it is used for both. If neither is given,
    both default to the directory returned by data_path().

    Args:
        url (str): url to download from
        fpath (Union[Path, str]): The location to save the url zip file to
        dest (Union[Path, str]): The destination to unzip {fpath}
        exist_ok (bool): if exist_ok, then skip if exists, otherwise throw error

    Raises:
        FileExistsError: if the zip file or the unzipped directory already
            exists and exist_ok is False
        requests.HTTPError: if the download answers with a 4xx/5xx status

    Returns:
        Real path of the unzipped directory, i.e. {dest} joined with the
        downloaded file's name up to its first dot
    """

    def _raise_file_exists_error(path: Union[Path, str]) -> None:
        if not exist_ok:
            raise FileExistsError(path, "Use param {exist_ok} to ignore.")

    if fpath is None and dest is None:
        fpath = dest = data_path()
    elif fpath is None:
        fpath = dest
    elif dest is None:
        dest = fpath

    os.makedirs(dest, exist_ok=True)
    os.makedirs(fpath, exist_ok=True)

    fname = _get_file_name(url)
    fname_without_extension = fname.split(".")[0]
    zip_file = Path(os.path.join(fpath, fname))
    unzipped_dir = Path(os.path.join(dest, fname_without_extension))

    # download zipfile if zipfile not exists
    if zip_file.is_file():
        _raise_file_exists_error(zip_file)
    else:
        r = requests.get(url)
        # Do not store an HTTP error page as the archive: it would be reused
        # (and fail to unzip) on every later call with exist_ok=True.
        r.raise_for_status()
        with open(zip_file, "wb") as f:
            f.write(r.content)

    # unzip downloaded zipfile if dir not exists
    if unzipped_dir.is_dir():
        _raise_file_exists_error(unzipped_dir)
    else:
        # Extract into {dest} (not {fpath}) so that the returned
        # {unzipped_dir}, which is built from {dest}, actually exists.
        with ZipFile(zip_file, "r") as z:
            z.extractall(dest)

    return os.path.realpath(unzipped_dir)
def unzip_urls(
    urls: List[str], dest: Union[Path, str] = data_path()
) -> List[str]:
    """Download and unzip every dataset in urls into dest.

    Args:
        urls: urls of the zip files to fetch.
        dest: directory used both to store the downloaded zip files and to
            unzip them; created if it does not exist. The default is the
            value data_path() returned when this function was defined.

    Returns:
        One path per url, as returned by unzip_url, in the order of urls.
    """
    # make dir if not exist
    target = Path(dest)
    if not target.is_dir():
        os.makedirs(dest)

    # download all data urls; already-present files and folders are reused
    return [
        unzip_url(u, fpath=dest, dest=dest, exist_ok=True) for u in urls
    ]
def root_path() -> Path:
    """Return the root dir: two levels above this module's folder, resolved."""
    module_dir = os.path.dirname(__file__)
    two_levels_up = os.path.join(module_dir, os.pardir, os.pardir)
    return Path(os.path.realpath(two_levels_up))