# Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
from .ndarray import ndarray
from .routines import array, empty, asarray, atleast_2d
import util
import gzip
import re
_ALLOCATIONGRANULARITY: Static[int] = 16384
_PROT_NONE: Static[int] = 0
_PROT_READ: Static[int] = 1
_PROT_WRITE: Static[int] = 2
_PROT_EXEC: Static[int] = 4
_MAP_SHARED: Static[int] = 1
_MAP_PRIVATE: Static[int] = 2
_MAGIC_PREFIX: Static[str] = '\x93NUMPY'
_ARRAY_ALIGN: Static[int] = 64
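# .npy layout (versions 1.0/2.0/3.0): the magic string above, two version
# bytes, a little-endian header length (2 bytes for version 1.x, 4 bytes
# otherwise), then a dict-literal header padded with spaces so that the
# binary data starts at a multiple of _ARRAY_ALIGN bytes.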
##################
# Memory Mapping #
##################
def _io_error(base_msg: str):
from C import seq_check_errno() -> str
c_msg = seq_check_errno()
raise IOError(f"{base_msg}: {c_msg}" if c_msg else base_msg)
def _mmap(f, length: int, access: int, flags: int, offset: int = 0):
from C import mmap(cobj, int, i32, i32, i32, int) -> cobj
from C import fileno(cobj) -> i32
fd = fileno(f.fp)
mm = mmap(cobj(), length, i32(access), i32(flags), fd, offset)
if int(mm) == -1:
_io_error("mmap() call failed")
return mm
def _munmap(p: cobj, length: int):
from C import munmap(cobj, int) -> i32
if munmap(p, length) == i32(-1):
_io_error("munmap() call failed")
def _fix_mmap_mode(mode: str):
if mode == 'readonly':
mode = 'r'
elif mode == 'copyonwrite':
mode = 'c'
elif mode == 'readwrite':
mode = 'r+'
elif mode == 'write':
mode = 'w+'
elif mode not in ('r', 'c', 'r+', 'w+'):
raise ValueError(
f"mode must be one of ['r', 'c', 'r+', 'w+', 'readonly', 'copyonwrite', 'readwrite', 'write'] (got {repr(mode)})"
)
return mode
def _memmap(fid, dtype: type, mode: str, offset: int, shape, forder: bool):
fid.seek(0, 2)
flen = fid.tell()
dbytes = util.sizeof(dtype)
if shape is None:
nbytes = flen - offset
if nbytes % dbytes:
raise ValueError("Size of available data is not a "
"multiple of the data-type size.")
size = nbytes // dbytes
sh = (size, )
else:
if isinstance(shape, int):
sh = (shape, )
else:
sh = shape
size = util.count(shape)
nbytes = offset + size * dbytes
if mode in ('w+', 'r+') and flen < nbytes:
fid.seek(nbytes - 1, 0)
fid.write('\0')
fid.flush()
if mode == 'c':
flags = _MAP_PRIVATE
acc = _PROT_READ | _PROT_WRITE
elif mode == 'r':
flags = _MAP_SHARED
acc = _PROT_READ
else:
flags = _MAP_SHARED
acc = _PROT_READ | _PROT_WRITE
start = offset - offset % _ALLOCATIONGRANULARITY
nbytes -= start
array_offset = offset - start
mm = _mmap(fid, length=nbytes, access=acc, flags=flags, offset=start)
return ndarray(sh, Ptr[dtype](mm + array_offset), fcontig=forder)
def memmap(filename,
dtype: type = u8,
mode: str = 'r+',
offset: int = 0,
shape=None,
order: str = 'C'):
mode = _fix_mmap_mode(mode)
if mode == 'w+' and shape is None:
raise ValueError("shape must be given if mode == 'w+'")
if offset < 0:
raise ValueError(
f"memmap() 'offset' cannot be negative (got {offset})")
forder = False
if order == 'C' or order == 'c':
forder = False
elif order == 'F' or order == 'f':
forder = True
else:
raise ValueError(f"memmap() 'order' must be 'C' or 'F' (got {order})")
if hasattr(filename, 'read'):
return _memmap(filename,
dtype=dtype,
mode=mode,
offset=offset,
shape=shape,
forder=forder)
elif isinstance(filename, str):
with open(filename, ('r' if mode == 'c' else mode) + 'b') as f:
return _memmap(f,
dtype=dtype,
mode=mode,
offset=offset,
shape=shape,
forder=forder)
else:
compile_error("filename must be a string or file handle")
###############
# General I/O #
###############
def parse_header(header: str):
regex = (
r"{\s*"
r"(?:"
r"(?:'descr'|\"descr\")\s*:\s*'(?P<descr>.*?)'\s*,?\s*"
r"|"
r"(?:'fortran_order'|\"fortran_order\")\s*:\s*(?P<fortran_order>True|False)\s*,?\s*"
r"|"
r"(?:'shape'|\"shape\")\s*:\s*(?P<shape>\(\)|\(\d+(?:, \d+)*(?:,)?\))\s*,?\s*"
r"){3}"
r"\s*}")
m = re.match(regex, header)
if m:
        # Group numbers are fixed by declaration order regardless of which
        # alternative matches first: 1 = 'descr', 2 = 'fortran_order',
        # 3 = 'shape'.
        dtype = m.group(1)
        fortran_order = m.group(2)
        shape = m.group(3)
return shape, fortran_order, dtype
else:
raise ValueError(f"Cannot parse header: {repr(header)}")
def _load(f, mmap_mode: Optional[str], ndim: Static[int], dtype: type):
magic = f.read(len(_MAGIC_PREFIX))
if magic != _MAGIC_PREFIX:
raise ValueError("Invalid magic string.")
# Extract the major and minor version numbers
major = f.read(1)
minor = f.read(1)
if (major != '\x01' and major != '\x02'
and major != '\x03') or minor != '\x00':
raise ValueError("Invalid version numbers.")
# Extract the header length as a little-endian unsigned int
if major == '\x01':
header_len_enc = f.read(2)
header_len = (int(header_len_enc.ptr[1]) << 8) | int(
header_len_enc.ptr[0])
else:
header_len_enc = f.read(4)
header_len = ((int(header_len_enc.ptr[3]) << 24) |
(int(header_len_enc.ptr[2]) << 16) |
(int(header_len_enc.ptr[1]) << 8)
| int(header_len_enc.ptr[0]))
# Read the header data
header_data = f.read(header_len)
# Deserialize the header dictionary data
shape, fortran_order, dt = parse_header(header_data)
# Extract shape information from the header data
elements = [
elem.strip() for elem in shape[1:-1].split(',') if elem.strip()
]
if len(elements) != ndim:
raise ValueError(
f"Loaded array has dimension {len(elements)}, but expected dimension {ndim} (specified by 'ndim' argument)"
)
shape = tuple(int(elements[i]) for i in staticrange(ndim))
forder = (fortran_order == 'True')
# Read the binary data
if mmap_mode is None:
str_data = f.read()
binary_data = str_data.ptr
got_bytes = len(str_data)
else:
arr_data = _memmap(f, dtype=byte, mode=mmap_mode, offset=f.tell(), shape=None, forder=forder)
binary_data = arr_data.data
got_bytes = arr_data.size
exp_bytes = util.count(shape) * util.sizeof(dtype)
if got_bytes != exp_bytes:
raise ValueError(
f"Unexpected number of bytes read from file for given array shape and dtype (expected {exp_bytes} but got {got_bytes})"
)
# Create a ndarray from the binary data
data = Ptr[dtype](binary_data)
array = ndarray[dtype, ndim](shape, data, fcontig=forder)
if dt.startswith('>') and mmap_mode is None:
array.byteswap(inplace=True)
return array
def load(file, mmap_mode: Optional[str] = None, ndim: Static[int] = 1, dtype: type = float):
if mmap_mode is not None:
mmap_mode = _fix_mmap_mode(mmap_mode)
if mmap_mode == 'w+':
raise ValueError("cannot use mmap_mode='w+' in load()")
if hasattr(file, 'read'):
return _load(file, mmap_mode=mmap_mode, ndim=ndim, dtype=dtype)
elif isinstance(file, str):
open_mode = 'rb'
if mmap_mode is not None:
open_mode = ('r' if mmap_mode == 'c' else mmap_mode) + 'b'
with open(file, open_mode) as f:
return _load(f, mmap_mode=mmap_mode, ndim=ndim, dtype=dtype)
else:
compile_error("fname must be a string or file handle")
def _save(f, arr):
arr = asarray(arr)
cc, fc = arr._contig
fortran_order = (fc and not cc)
header_parts = ("{'descr': '",
util.dtype_to_str(arr.dtype, include_byteorder=True),
"', 'fortran_order': ",
"True, 'shape': " if fortran_order else "False, 'shape': ",
str(arr.shape), ", }")
header_len = sum(len(h) for h in header_parts) + 1 # +1 for newline
long_header = False
if header_len > 0xffffffff:
raise ValueError("Header is too long for .npy format")
elif header_len > 0xffff:
long_header = True
else:
long_header = False
f.write(_MAGIC_PREFIX)
f.write('\x02\x00' if long_header else '\x01\x00')
m = len(_MAGIC_PREFIX) + 2 + (4 if long_header else 2) + header_len
rem = m % _ARRAY_ALIGN
spaces = (_ARRAY_ALIGN - rem) if rem else 0
header_len += spaces
if long_header:
byte1 = byte(header_len & 0xFF)
byte2 = byte((header_len >> 8) & 0xFF)
byte3 = byte((header_len >> 16) & 0xFF)
byte4 = byte((header_len >> 24) & 0xFF)
f.write(str(__ptr__(byte1), 1))
f.write(str(__ptr__(byte2), 1))
f.write(str(__ptr__(byte3), 1))
f.write(str(__ptr__(byte4), 1))
else:
byte1 = byte(header_len & 0xFF)
byte2 = byte((header_len >> 8) & 0xFF)
f.write(str(__ptr__(byte1), 1))
f.write(str(__ptr__(byte2), 1))
for h in header_parts:
f.write(h)
for _ in range(spaces):
f.write('\x20')
f.write('\n')
if cc or fc:
f.write(str(arr.data.as_byte(), arr.nbytes))
else:
for idx in util.multirange(arr.shape):
e = arr._ptr(idx)[0]
s = str(__ptr__(e).as_byte(), util.sizeof(arr.dtype))
f.write(s)
def save(file, arr):
if hasattr(file, 'write'):
_save(file, arr)
elif isinstance(file, str):
if not file.endswith('.npy'):
file += '.npy'
with open(file, 'wb') as f:
_save(f, arr)
else:
compile_error("fname must be a string or file handle")
def _savetxt(f, X, delimiter: str, newline: str, header: str, footer: str,
comments: str):
X = asarray(X)
if header:
header = header.replace('\n', '\n' + comments)
f.write(comments)
f.write(header)
f.write(newline)
if X.ndim == 1:
for x in X:
f.write(str(x))
f.write(newline)
elif X.ndim == 2:
m, n = X.shape
if m and n:
for i in range(m):
for j in range(n):
x = X._ptr((i, j))[0]
f.write(str(x))
if j < n - 1:
f.write(delimiter)
f.write(newline)
else:
compile_error("Expected 1D or 2D array")
if footer:
footer = footer.replace('\n', '\n' + comments)
f.write(comments)
f.write(footer)
f.write(newline)
def savetxt(fname,
X,
delimiter: str = ' ',
newline: str = '\n',
header: str = '',
footer: str = '',
comments: str = '# '):
if hasattr(fname, 'write'):
_savetxt(fname,
X,
delimiter=delimiter,
newline=newline,
header=header,
footer=footer,
comments=comments)
elif isinstance(fname, str):
if fname.endswith('.gz'):
with gzip.open(fname, 'w9') as f:
_savetxt(f,
X,
delimiter=delimiter,
newline=newline,
header=header,
footer=footer,
comments=comments)
else:
with open(fname, 'w') as f:
_savetxt(f,
X,
delimiter=delimiter,
newline=newline,
header=header,
footer=footer,
comments=comments)
else:
compile_error("fname must be a string or a file handle")
def _fromfile(f, dtype: type, count: int, sep: str, offset: int):
if sep:
if offset:
raise TypeError(
"'offset' argument only permitted for binary files")
string_data = f.read()
if sep.isspace():
string_split = string_data.split(None, count)
else:
string_split = string_data.split(sep, count)
n = len(string_split)
p = Ptr[dtype](n)
for i in range(n):
p[i] = dtype(string_split[i])
else:
if offset:
f.seek(offset, 1)
if count < 0:
binary_data = f.read()
else:
binary_data = f.read(count * util.sizeof(dtype))
p = Ptr[dtype](binary_data.ptr)
n = len(binary_data) // util.sizeof(dtype)
return ndarray((n, ), p)
def fromfile(file,
dtype: type = float,
count: int = -1,
sep: str = '',
offset: int = 0):
if hasattr(file, 'read'):
return _fromfile(file,
dtype=dtype,
count=count,
sep=sep,
offset=offset)
elif isinstance(file, str):
with open(file, 'rb') as f:
return _fromfile(f,
dtype=dtype,
count=count,
sep=sep,
offset=offset)
else:
compile_error("fname must be a string or a file handle")
def fromstring(string: str,
dtype: type = float,
count: int = -1,
sep: str = ''):
if sep:
split = string.split(sep, count)
k = len(split)
n = count if count >= 0 else k
result = empty((n, ), dtype=dtype)
p = result.data
for i in range(k if count < 0 else min(k, count)):
p[i] = dtype(split[i])
return result
else:
if count < 0:
if len(string) < util.sizeof(dtype):
raise ValueError("string is smaller than requested size")
if len(string) % util.sizeof(dtype) != 0:
raise ValueError(
"string size must be a multiple of element size")
else:
if len(string) < count * util.sizeof(dtype):
raise ValueError("string is smaller than requested size")
n = count if count >= 0 else (len(string) // util.sizeof(dtype))
return ndarray((n, ), Ptr[dtype](string.ptr))
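# Usage sketch: with 'sep' the string is parsed as text; with sep='' its
# bytes are reinterpreted in place (no copy is made).
#   fromstring('1 2 3', dtype=int, sep=' ')  # 3-element int array
#   fromstring('\x00' * 16, dtype=float)     # 2-element float array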
def _tofile(arr, f, sep: str):
if sep:
k = 0
n = arr.size
for idx in util.multirange(arr.shape):
e = arr._ptr(idx)[0]
f.write(str(e))
if k < n - 1:
f.write(sep)
k += 1
else:
cc, _ = arr._contig
if cc:
f.write(str(arr.data.as_byte(), arr.nbytes))
else:
for idx in util.multirange(arr.shape):
e = arr._ptr(idx)[0]
s = str(__ptr__(e).as_byte(), util.sizeof(arr.dtype))
f.write(s)
@extend
class ndarray:
def tofile(self, file, sep: str = ''):
if hasattr(file, 'write'):
_tofile(self, file, sep=sep)
elif isinstance(file, str):
with open(file, 'w' if sep else 'wb') as f:
_tofile(self, f, sep=sep)
else:
compile_error("fname must be a string or file handle")
########################
# loadtxt / genfromtxt #
########################
_NEWLINE: Static[int] = 10  # '\n'
_CARTRIDGE: Static[int] = 13  # '\r' (carriage return)
_DEFAULT_ROWS: Static[int] = 512  # initial row capacity when max_rows is unknown
@tuple
class Converters:
funcs: F
mask: M
usecols: U
dtype: type
F: type
M: type
U: type
def __new__(funcs, mask, usecols,
dtype: type) -> Converters[dtype, F, M, U]:
return (funcs, mask, usecols)
def __call__(self, field: str, idx: int):
usecols = self.usecols
if usecols is not None:
idx = usecols[idx]
if self.mask[idx]:
return self.funcs[idx](field)
else:
return self.dtype(field)
def normalize_col(col: int, num_fields: int):
if col >= num_fields or col < -num_fields:
raise ValueError(
f"given column {col} is out of bounds (file has {num_fields} columns)"
)
elif col < 0:
return col + num_fields
else:
return col
def default_fill(dtype: type):
if dtype is bool:
return False
elif dtype is int or isinstance(dtype, Int) or isinstance(dtype, UInt):
return dtype(-1)
elif dtype is float or dtype is float32 or dtype is float16:
return util.nan(dtype)
elif dtype is complex:
return complex(util.nan64(), 0.0)
elif dtype is complex64:
return complex64(util.nan32(), float32(0.0))
elif dtype is str:
return '???'
else:
return util.zero(dtype)
def malformed(row: int, num_fields: int):
raise IOError(
f"inconsistent number of fields in file (row = {row}, expected fields = {num_fields})"
)
def min_dim(arr: ndarray, ndmin: Static[int]):
if arr.ndim == 1 and ndmin == 2:
return arr.reshape(arr.size, 1)
else:
return arr
def make_conv(converters, num_fields: int, usecols, dtype: type):
if isinstance(converters, Dict):
funcs = Ptr[converters.V](num_fields)
mask = Ptr[bool](num_fields)
for i in range(num_fields):
mask[i] = False
for k, v in converters.items():
col = normalize_col(k, num_fields)
mask[col] = True
funcs[col] = v
return Converters[dtype, type(funcs),
type(mask),
type(usecols)](funcs, mask, usecols, dtype)
else:
return converters
class CSVReader:
_path: str
_delimiter: byte
_quotechar: byte
_comments: str
_mmap_ptr: cobj
_mmap_len: int
def __init__(self,
path: str,
delimiter: str = ',',
quotechar: str = '"',
comments: str = ''):
dm = byte(0)
if len(delimiter) == 1:
dm = delimiter.ptr[0]
elif len(delimiter) != 0:
raise ValueError("'delimiter' must be a length-1 string")
qc = byte(0)
if len(quotechar) == 1:
qc = quotechar.ptr[0]
elif len(quotechar) != 0:
raise ValueError("'quotechar' must be a length-1 string")
self._path = path
self._delimiter = dm
self._quotechar = qc
self._comments = comments
self._mmap_ptr = cobj()
self._mmap_len = 0
def __enter__(self):
with open(self._path) as f:
f.seek(0, 2)
n = f.tell()
if n > 0:
self._mmap_ptr = _mmap(f, length=n, access=_PROT_READ, flags=_MAP_SHARED)
self._mmap_len = n
if int(self._mmap_ptr) == -1:
raise IOError("CSVReader error: mmap() failed")
def __exit__(self):
if self._mmap_len > 0:
_munmap(self._mmap_ptr, self._mmap_len)
self._mmap_ptr = cobj()
self._mmap_len = 0
def is_delimiter(self, c: byte):
delimiter = self._delimiter
if delimiter:
return c == delimiter
else:
return bool(_C.isspace(i32(int(c))))
def is_comment(self, c: byte):
comments = self._comments
for i in range(len(comments)):
if comments.ptr[i] == c:
return True
return False
def skip_delimiter(self, i: int):
delimiter = self._delimiter
i += 1
# Single-char case
if delimiter:
return i
# Whitespace case
p = self._mmap_ptr
n = self._mmap_len
while i < n:
c = p[i]
if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)
or not self.is_delimiter(c)):
break
i += 1
return i
def skip_comments(self, i: int):
if not self._comments:
return i
p = self._mmap_ptr
n = self._mmap_len
while i < n:
c = p[i]
if self.is_comment(c):
i += 1
while i < n:
c = p[i]
i += 1
if c == byte(_NEWLINE) or c == byte(_CARTRIDGE):
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(
_NEWLINE):
i += 1
break
else:
break
return i
def skip_lines(self, i: int, skip: int):
p = self._mmap_ptr
n = self._mmap_len
skipped = 0
while i < n and skipped < skip:
c = p[i]
if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)):
i += 1
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE):
i += 1
skipped += 1
else:
i += 1
return i
def get_num_fields(self, i0: int):
p = self._mmap_ptr
n = self._mmap_len
quotechar = self._quotechar
comments = self._comments
if n == 0:
return 0
quote = False
num_fields = 1
i = self.skip_comments(i0)
if i >= n:
return 0
# Parse first row to get field count
while i < n:
c = p[i]
if quotechar and c == quotechar:
if quote:
if i + 1 < n and p[i + 1] == quotechar:
i += 2
else:
quote = False
i += 1
else:
quote = True
i += 1
elif (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)) and not quote:
i += 1
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE):
i += 1
break
elif self.is_delimiter(c) and not quote:
num_fields += 1
i = self.skip_delimiter(i)
elif self.is_comment(c) and not quote:
break
else:
i += 1
return num_fields
def parse(self, row_callback, num_fields: int, maxrows: int, i0: int):
p = self._mmap_ptr
n = self._mmap_len
quotechar = self._quotechar
if n == 0 or num_fields == 0 or maxrows == 0:
return
quote = False
field = 0
fields = Ptr[str](num_fields)
row = 0
i = self.skip_comments(i0)
last_field_start = i
while i < n:
c = p[i]
if quotechar and c == quotechar:
if quote:
if i + 1 < n and p[i + 1] == quotechar:
i += 2
else:
quote = False
i += 1
else:
quote = True
i += 1
elif (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)) and not quote:
if field != num_fields - 1:
malformed(row, num_fields)
fields[field] = str(p + last_field_start, i - last_field_start)
row_callback(fields, num_fields, row)
i += 1
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE):
i += 1
i = self.skip_comments(i)
last_field_start = i
field = 0
row += 1
if maxrows >= 0 and row >= maxrows:
break
elif self.is_delimiter(c) and not quote:
fields[field] = str(p + last_field_start, i - last_field_start)
field += 1
if field >= num_fields:
malformed(row, num_fields)
i = self.skip_delimiter(i)
last_field_start = i
elif self.is_comment(c) and not quote:
if field != num_fields - 1:
malformed(row, num_fields)
fields[field] = str(p + last_field_start, i - last_field_start)
row_callback(fields, num_fields, row)
i = self.skip_comments(i)
last_field_start = i
field = 0
row += 1
if maxrows >= 0 and row >= maxrows:
break
else:
i += 1
if field > 0:
if field != num_fields - 1:
malformed(row, num_fields)
if maxrows < 0 or row <= maxrows:
fields[field] = str(p + last_field_start, i - last_field_start)
row_callback(fields, num_fields, row)
class ArrayUpdate:
arr: A
cols: C
conv: F
last_row: int
A: type
C: type
F: type
def __init__(self, arr: A, cols: C, conv: F):
self.arr = arr
self.cols = cols
self.conv = conv
self.last_row = 0
@property
def cap(self):
arr = self.arr
if isinstance(arr, Tuple):
return arr[0].size
else:
return arr.shape[0]
def resize_arrays(self, new_cap: int):
def resize_one(a, new_cap: int):
if a.ndim == 1:
data_new = util.realloc(a.data, new_cap, a.size)
return ndarray((new_cap, ), data_new)
else:
rows, cols = a.shape
data_new = util.realloc(a.data, new_cap * cols, a.size)
return ndarray((new_cap, cols), data_new)
arr = self.arr
if isinstance(arr, Tuple):
self.arr = tuple(resize_one(a, new_cap) for a in arr)
else:
self.arr = resize_one(arr, new_cap)
def trim_arrays(self):
if self.last_row < self.cap - 1:
self.resize_arrays(self.last_row + 1)
def convert(self, field: str, idx: int, dtype: type) -> dtype:
conv = self.conv
if conv is None:
r = dtype(field)
elif isinstance(conv, Converters):
r = conv(field, idx)
elif hasattr(conv, '__call__'):
r = conv(field)
else:
r = conv[idx](field)
# Make sure we copy strings out of the mmap buffer
if isinstance(r, str):
return r.__ptrcopy__()
else:
return r
def convert_static(self, field: str, idx: Static[int],
dtype: type) -> dtype:
conv = self.conv
if conv is None:
r = dtype(field)
elif isinstance(conv, Converters):
r = conv(field, idx)
elif hasattr(conv, '__call__'):
r = conv(field)
else:
r = conv[idx](field)
# Make sure we copy strings out of the mmap buffer
if isinstance(r, str):
return r.__ptrcopy__()
else:
return r
def __call__(self, fields: Ptr[str], num_fields: int, row: int):
        def get_new_size(size: int, min_grow: int = 512):
            # Grow by ~25% (at least 'min_grow'), rounding the result up
            # to a multiple of 'min_grow'.
            new_size = size
            growth = size >> 2
            if growth <= min_grow:
                new_size += min_grow
            else:
                new_size += growth + min_grow - 1
                new_size &= ~(min_grow - 1)
            return new_size
arr = self.arr
cols = self.cols
cap = self.cap
if row >= cap:
new_cap = get_new_size(cap)
self.resize_arrays(new_cap)
# Lots of different cases to consider...
if cols is not None:
if isinstance(arr, Tuple):
for i in staticrange(staticlen(cols)):
col = cols[i]
arr[i].data[row] = self.convert_static(
fields[col], i, arr[i].dtype)
else:
if isinstance(arr.dtype, Tuple):
dummy = util.zero(arr.dtype)
tup = tuple(
self.convert(fields[cols[i]], i, type(dummy[i]))
for i in staticrange(staticlen(arr.dtype)))
arr._ptr((row, ))[0] = tup
else:
if arr.ndim == 1:
col = cols[0]
arr.data[row] = self.convert_static(
fields[col], 0, arr.dtype)
else:
for i in staticrange(staticlen(cols)):
col = cols[i]
arr._ptr((row, i))[0] = self.convert_static(
fields[col], i, arr.dtype)
else:
if isinstance(arr, Tuple):
for i in staticrange(staticlen(arr)):
arr[i]._ptr((row, ))[0] = self.convert_static(
fields[i], i, arr[i].dtype)
elif isinstance(arr.dtype, Tuple):
dummy = util.zero(arr.dtype)
tup = tuple(
self.convert(fields[i], i, type(dummy[i]))
for i in staticrange(staticlen(arr.dtype)))
arr._ptr((row, ))[0] = tup
else:
for i in range(num_fields):
arr._ptr(
(row, i))[0] = self.convert(fields[i], i, arr.dtype)
self.last_row = row
def loadtxt(fname: str,
dtype: type = float,
comments: Optional[str] = '#',
delimiter: Optional[str] = None,
converters=None,
skiprows: int = 0,
usecols=None,
unpack: Static[int] = False,
ndmin: Static[int] = 0,
max_rows: Optional[int] = None,
quotechar: Optional[str] = None):
if isinstance(usecols, int):
cols = (usecols, )
else:
cols = usecols
if cols is not None:
if staticlen(cols) == 0:
compile_error("cannot pass empty tuple to 'usecols'")
if ndmin != 0 and ndmin != 1 and ndmin != 2:
compile_error("'ndmin' must be 0, 1 or 2")
maxrows: int = max_rows if max_rows is not None else -1
block_size = maxrows if maxrows >= 0 else _DEFAULT_ROWS
with CSVReader(fname,
delimiter=(delimiter if delimiter is not None else ''),
quotechar=(quotechar if quotechar is not None else ''),
comments=comments if comments is not None else '') as csv:
i0 = csv.skip_lines(i=0, skip=skiprows) if skiprows > 0 else 0
num_fields = csv.get_num_fields(i0)
if cols is None:
if isinstance(dtype, Tuple):
if staticlen(dtype) != num_fields:
                    raise ValueError(
                        "number of fields differs from the given tuple 'dtype' length"
                    )
if unpack:
dummy = util.zero(dtype)
arr = tuple(
empty((block_size, ), type(dummy[i]))
for i in staticrange(staticlen(dtype)))
else:
arr = empty((block_size, ), dtype)
else:
arr = empty((block_size, num_fields), dtype)
else:
if isinstance(dtype, Tuple):
if staticlen(dtype) != staticlen(cols):
compile_error(
"'usecols' has different length than given tuple 'dtype'"
)
cols = tuple(normalize_col(col, num_fields) for col in cols)
for i in range(1, len(cols)):
for j in range(i):
if cols[i] == cols[j]:
raise ValueError(
f"duplicate column {cols[i]} given in 'usecols'")
if staticlen(cols) == 1:
arr = empty((block_size, ), dtype)
else:
if unpack:
if isinstance(dtype, Tuple):
dummy = util.zero(dtype)
ncols: Static[int] = staticlen(cols)
arr = tuple(
empty((block_size, ), type(dummy[i]))
for i in staticrange(ncols))
else:
arr = tuple(empty((block_size, ), dtype) for _ in cols)
else:
if isinstance(dtype, Tuple):
arr = empty((block_size, ), dtype)
else:
arr = empty((block_size, len(cols)), dtype)
converters = make_conv(converters, num_fields, cols, dtype)
callback = ArrayUpdate(arr, cols, converters)
csv.parse(callback,
num_fields=num_fields,
maxrows=maxrows,
i0=i0)
callback.trim_arrays()
arr = callback.arr
if isinstance(arr, ndarray):
if unpack:
return min_dim(arr, ndmin).T
else:
return min_dim(arr, ndmin)
else:
return arr
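# Usage sketch ('table.csv' is a hypothetical file). Unlike NumPy, 'dtype'
# is a compile-time type and 'usecols'/'unpack' must be statically known:
#   a = loadtxt('table.csv', dtype=float, delimiter=',', skiprows=1)
#   x, y = loadtxt('table.csv', dtype=float, delimiter=',',
#                  usecols=(0, 2), unpack=True)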
class CSVReaderGen:
_path: str
_delimiter: byte
_comments: str
_names: List[str]
_mmap_ptr: cobj
_mmap_len: int
_length: int
def __init__(self, path: str, delimiter: str = ',', comments: str = ''):
dm = byte(0)
if len(delimiter) == 1:
dm = delimiter.ptr[0]
elif len(delimiter) != 0:
raise ValueError("'delimiter' must be a length-1 string")
self._path = path
self._delimiter = dm
self._comments = comments
self._names = []
self._mmap_ptr = cobj()
self._mmap_len = 0
self._length = 0
def __enter__(self):
if not self._path:
return
with open(self._path) as f:
f.seek(0, 2)
n = f.tell()
if n > 0:
self._mmap_ptr = _mmap(f, length=n, access=_PROT_READ, flags=_MAP_SHARED)
self._mmap_len = n
if int(self._mmap_ptr) == -1:
raise IOError("CSVReader error: mmap() failed")
def __exit__(self):
if self._mmap_len > 0:
_munmap(self._mmap_ptr, self._mmap_len)
self._mmap_ptr = cobj()
self._mmap_len = 0
def fix_names(self, excludelist: List[str], deletechars: str,
replace_space: str, upper: bool, lower: bool):
def fix_name(name: str, excludelist: List[str], deletechars: str,
replace_space: str, upper: bool, lower: bool):
s = _strbuf(capacity=len(name))
for c in name:
if c in deletechars:
continue
elif c.isspace():
s.append(replace_space)
elif upper:
s.append(c.upper())
elif lower:
s.append(c.lower())
else:
s.append(c)
if s.__str__() in excludelist:
s.append('_')
return s.__str__()
names = self._names
for i in range(len(names)):
names[i] = fix_name(names[i],
excludelist=excludelist,
deletechars=deletechars,
replace_space=replace_space,
upper=upper,
lower=lower)
def is_delimiter(self, c: byte):
delimiter = self._delimiter
if delimiter:
return c == delimiter
else:
return bool(_C.isspace(i32(int(c))))
def is_comment(self, c: byte):
comments = self._comments
for i in range(len(comments)):
if comments.ptr[i] == c:
return True
return False
def skip_lines(self, i: int, skip: int):
p = self._mmap_ptr
n = self._length
skipped = 0
while i < n and skipped < skip:
c = p[i]
if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)):
i += 1
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE):
i += 1
skipped += 1
else:
i += 1
return i
def find_length(self, skip_footer: int):
p = self._mmap_ptr
n = self._mmap_len
skipped = 0
if skip_footer <= 0:
self._length = n
return
i = n - 1
# Newline at the very end doesn't count
if i >= 0 and p[i] == byte(_NEWLINE):
i -= 1
while i >= 0:
c = p[i]
if c == byte(_NEWLINE):
skipped += 1
if skipped == skip_footer:
self._length = i + 1
return
i -= 1
self._length = 0
def skip_delimiter(self, i: int):
delimiter = self._delimiter
i += 1
# Single-char case
if delimiter:
return i
# Whitespace case
p = self._mmap_ptr
n = self._length
while i < n:
c = p[i]
if not self.is_delimiter(c):
break
i += 1
return i
def skip_delimiter(self, i: int, line: str):
delimiter = self._delimiter
i += 1
# Single-char case
if delimiter:
return i
# Whitespace case
p = line.ptr
n = len(line)
while i < n:
c = p[i]
if not self.is_delimiter(c):
break
i += 1
return i
def skip_comments(self, i: int):
if not self._comments:
return i
p = self._mmap_ptr
n = self._length
while i < n:
c = p[i]
if self.is_comment(c):
i += 1
while i < n:
c = p[i]
i += 1
if c == byte(_NEWLINE) or c == byte(_CARTRIDGE):
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(
_NEWLINE):
i += 1
break
else:
break
return i
def get_num_fields(self, i: int, get_names: bool):
p = self._mmap_ptr
n = self._length
comments = self._comments
names = self._names
if i >= n:
return 0, i
i0 = i
num_fields = 1
        if self.is_comment(p[i]):
            i += 1
last_field_start = i
while i < n:
c = p[i]
if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment(
c):
if get_names:
name = str(p + last_field_start, i - last_field_start)
names.append(name.strip())
i += 1
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE):
i += 1
break
if self.is_delimiter(c):
if get_names:
name = str(p + last_field_start, i - last_field_start)
names.append(name.strip())
num_fields += 1
i = self.skip_delimiter(i)
last_field_start = i
else:
i += 1
return num_fields, i if get_names else i0
def partition_line(self, line: str, delimiter, num_fields: int, row: int,
invalid_raise: bool):
if isinstance(delimiter, int):
n = 0
i = 0
while i < len(line) and n < num_fields:
yield line[i:i + delimiter]
i += delimiter
n += 1
if n < num_fields:
if invalid_raise:
malformed(row, num_fields)
while True:
yield ''
n += 1
if n >= num_fields:
break
else:
n = 0
i = 0
while i < len(line) and n < len(delimiter):
d = delimiter[n]
if not isinstance(d, int):
compile_error(
"'delimiter' must be an int or a sequence of ints")
yield line[i:i + d]
i += d
n += 1
if n < num_fields:
if invalid_raise:
malformed(row, num_fields)
while True:
yield ''
n += 1
if n >= num_fields:
break
def get_num_fields_spaced(self, i: int, get_names: bool, delimiter):
p = self._mmap_ptr
n = self._length
comments = self._comments
names = self._names
if i >= n:
return 0, i
i0 = i
        if self.is_comment(p[i]):
            i += 1
line_start = i
line = ''
while i < n:
c = p[i]
if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment(
c):
line = str(p + line_start, i - line_start)
i += 1
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE):
i += 1
break
else:
i += 1
num_fields = 0
if isinstance(delimiter, int):
num_fields = len(line) // delimiter + (1 if len(line) %
delimiter else 0)
else:
num_fields = len(delimiter)
if get_names:
for name in self.partition_line(line,
delimiter=delimiter,
num_fields=num_fields,
row=0,
invalid_raise=False):
names.append(name)
return num_fields, i if get_names else i0
def get_num_fields_single(self, line: str, get_names: bool):
p = line.ptr
n = len(line)
names = self._names
i = 0
if n > 0 and self.is_comment(p[0]):
i += 1
last_field_start = i
num_fields = 0
while i < n:
c = p[i]
if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment(
c):
if get_names:
name = str(p + last_field_start, i - last_field_start)
names.append(name.strip())
num_fields += 1
break
elif self.is_delimiter(c):
if get_names:
name = str(p + last_field_start, i - last_field_start)
names.append(name.strip())
num_fields += 1
i = self.skip_delimiter(i, line)
last_field_start = i
else:
i += 1
return num_fields
def get_num_fields_single_spaced(self, line: str, get_names: bool,
delimiter):
p = line.ptr
n = len(line)
names = self._names
i = 0
if n > 0 and self.is_comment(p[0]):
i += 1
line_start = i
while i < n:
c = p[i]
if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment(
c):
line = str(p + line_start, i - line_start)
break
else:
i += 1
if isinstance(delimiter, int):
num_fields = len(line) // delimiter + (1 if len(line) %
delimiter else 0)
else:
num_fields = len(delimiter)
if get_names:
for name in self.partition_line(line,
delimiter=delimiter,
num_fields=num_fields,
row=0,
invalid_raise=False):
names.append(name)
return num_fields
def translate_cols(self, usecols, num_fields: int):
def translate_one(self, c, num_fields: int):
if isinstance(c, int):
return normalize_col(c, num_fields)
elif isinstance(c, str):
return self._names.index(c)
else:
compile_error("'usecols' elements must be either int or str")
cols = tuple(translate_one(self, c, num_fields) for c in usecols)
for i in range(1, len(cols)):
for j in range(i):
if cols[i] == cols[j]:
raise ValueError(
f"duplicate column {cols[i]} given in 'usecols'")
return cols
def parse(self, i: int, row_callback, num_fields: int, maxrows: int,
invalid_raise: bool):
p = self._mmap_ptr
n = self._length
if n == 0 or num_fields == 0 or maxrows == 0:
return
field = 0
fields = Ptr[str](num_fields)
row = 0
last_field_start = i
while i < n:
c = p[i]
if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)):
ok = True
if field != num_fields - 1:
if invalid_raise:
malformed(row, num_fields)
else:
ok = False
if ok:
fields[field] = str(p + last_field_start,
i - last_field_start)
row_callback(fields, num_fields, row)
i += 1
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE):
i += 1
i = self.skip_comments(i)
last_field_start = i
field = 0
row += 1
if maxrows >= 0 and row >= maxrows:
break
            elif self.is_delimiter(c):
                # Store the field only while in bounds; an overflow either
                # raises (if 'invalid_raise') or drops the extra fields.
                if field < num_fields:
                    fields[field] = str(p + last_field_start,
                                        i - last_field_start)
                elif invalid_raise:
                    malformed(row, num_fields)
                field += 1
                i = self.skip_delimiter(i)
                last_field_start = i
elif self.is_comment(c):
ok = True
if field != num_fields - 1:
if invalid_raise:
malformed(row, num_fields)
else:
ok = False
if ok:
fields[field] = str(p + last_field_start,
i - last_field_start)
row_callback(fields, num_fields, row)
i = self.skip_comments(i)
last_field_start = i
field = 0
row += 1
if maxrows >= 0 and row >= maxrows:
break
else:
i += 1
if field > 0:
ok = True
if field != num_fields - 1:
if invalid_raise:
malformed(row, num_fields)
else:
ok = False
if ok and (maxrows < 0 or row <= maxrows):
fields[field] = str(p + last_field_start, i - last_field_start)
row_callback(fields, num_fields, row)
def parse_spaced(self, i: int, row_callback, num_fields: int, maxrows: int,
invalid_raise: bool, delimiter):
p = self._mmap_ptr
n = self._length
if n == 0 or num_fields == 0 or maxrows == 0:
return
fields = Ptr[str](num_fields)
row = 0
last_line_start = i
while i < n:
c = p[i]
if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)):
line = str(p + last_line_start, i - last_line_start)
k = 0
for f in self.partition_line(line,
delimiter=delimiter,
num_fields=num_fields,
row=row,
invalid_raise=invalid_raise):
fields[k] = f
k += 1
row_callback(fields, num_fields, row)
i += 1
if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE):
i += 1
i = self.skip_comments(i)
last_line_start = i
row += 1
if maxrows >= 0 and row >= maxrows:
break
elif self.is_comment(c):
line = str(p + last_line_start, i - last_line_start)
k = 0
for f in self.partition_line(line,
delimiter=delimiter,
num_fields=num_fields,
row=row,
invalid_raise=invalid_raise):
fields[k] = f
k += 1
row_callback(fields, num_fields, row)
i = self.skip_comments(i)
last_line_start = i
row += 1
if maxrows >= 0 and row >= maxrows:
break
else:
i += 1
if last_line_start < i and (maxrows < 0 or row < maxrows):
line = str(p + last_line_start, i - last_line_start)
k = 0
for f in self.partition_line(line,
delimiter=delimiter,
num_fields=num_fields,
row=row,
invalid_raise=invalid_raise):
fields[k] = f
k += 1
row_callback(fields, num_fields, row)
def parse_single(self, line: str, row_callback, row: int, fields: Ptr[str],
num_fields: int, invalid_raise: bool):
if not line:
return False
p = line.ptr
n = len(line)
i = 0
last_field_start = 0
field = 0
if n > 0 and self.is_comment(p[0]):
return False
while i < n:
c = p[i]
if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment(
c):
break
elif self.is_delimiter(c):
if field >= num_fields:
if invalid_raise:
malformed(row, num_fields)
else:
return False
fields[field] = str(p + last_field_start, i - last_field_start)
field += 1
i = self.skip_delimiter(i, line)
last_field_start = i
else:
i += 1
if i > last_field_start:
if field >= num_fields:
if invalid_raise:
malformed(row, num_fields)
else:
return False
fields[field] = str(p + last_field_start, i - last_field_start)
field += 1
if field != num_fields:
if invalid_raise:
malformed(row, num_fields)
else:
return False
row_callback(fields, num_fields, row)
return True
def parse_single_spaced(self, line: str, row_callback, row: int,
fields: Ptr[str], num_fields: int,
invalid_raise: bool, delimiter):
if not line:
return False
p = line.ptr
n = len(line)
i = 0
if n > 0 and self.is_comment(p[0]):
return False
while i < n:
c = p[i]
if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment(
c):
line = line[:i]
break
else:
i += 1
k = 0
for f in self.partition_line(line,
delimiter=delimiter,
num_fields=num_fields,
row=row,
invalid_raise=invalid_raise):
fields[k] = f
k += 1
row_callback(fields, num_fields, row)
return True
class ArrayUpdateGen:
arr: A
cols: C
conv: F
filling_values: M
autostrip: bool
loose: bool
last_row: int
A: type
C: type
F: type
M: type
def __init__(self, arr: A, cols: C, conv: F, filling_values: M,
autostrip: bool, loose: bool):
self.arr = arr
self.cols = cols
self.conv = conv
self.filling_values = filling_values
self.autostrip = autostrip
self.loose = loose
self.last_row = 0
@property
def cap(self):
arr = self.arr
if isinstance(arr, Tuple):
return arr[0].size
else:
return arr.shape[0]
def resize_arrays(self, new_cap: int):
def resize_one(a, new_cap: int):
if a.ndim == 1:
data_new = util.realloc(a.data, new_cap, a.size)
return ndarray((new_cap, ), data_new)
else:
rows, cols = a.shape
data_new = util.realloc(a.data, new_cap * cols, a.size)
return ndarray((new_cap, cols), data_new)
arr = self.arr
if isinstance(arr, Tuple):
self.arr = tuple(resize_one(a, new_cap) for a in arr)
else:
self.arr = resize_one(arr, new_cap)
def trim_arrays(self):
if self.last_row < self.cap - 1:
self.resize_arrays(self.last_row + 1)
def fill_value(self, idx: int, dtype: type) -> dtype:
filling_values = self.filling_values
if filling_values is None:
return default_fill(dtype)
elif (isinstance(filling_values, List)
or isinstance(filling_values, Tuple)
or isinstance(filling_values, ndarray)):
return filling_values[idx]
elif isinstance(filling_values, Dict):
default = default_fill(dtype)
if filling_values.K is int:
return filling_values.get(idx, default)
elif filling_values.K is str:
return filling_values.get(self._names[idx], default)
else:
compile_error("'filling_values' keys must be int or str")
else:
return filling_values
def fill_value_static(self, idx: Static[int], dtype: type) -> dtype:
filling_values = self.filling_values
if filling_values is None:
return default_fill(dtype)
elif (isinstance(filling_values, List)
or isinstance(filling_values, Tuple)
or isinstance(filling_values, ndarray)):
return filling_values[idx]
elif isinstance(filling_values, Dict):
default = default_fill(dtype)
if filling_values.K is int:
return filling_values.get(idx, default)
elif filling_values.K is str:
return filling_values.get(self._names[idx], default)
else:
compile_error("'filling_values' keys must be int or str")
else:
return filling_values
def convert(self, field: str, idx: int, dtype: type) -> dtype:
field0 = field
conv = self.conv
if self.autostrip:
field = field.strip()
if not field and conv is None:
r = self.fill_value(idx, dtype)
else:
try:
if conv is None:
r = dtype(field)
elif isinstance(conv, Converters):
r = conv(field, idx)
elif hasattr(conv, '__call__'):
r = conv(field)
else:
r = conv[idx](field)
except:
if not self.loose:
raise ValueError(f"Cannot convert string '{field0}'")
r = self.fill_value(idx, dtype)
# Make sure we copy strings out of the mmap buffer
if isinstance(r, str):
return r.__ptrcopy__()
else:
return r
def convert_static(self, field: str, idx: Static[int],
dtype: type) -> dtype:
field0 = field
conv = self.conv
if self.autostrip:
field = field.strip()
if not field and conv is None:
r = self.fill_value_static(idx, dtype)
else:
try:
if conv is None:
r = dtype(field)
elif isinstance(conv, Converters):
r = conv(field, idx)
elif hasattr(conv, '__call__'):
r = conv(field)
else:
r = conv[idx](field)
except:
if not self.loose:
raise ValueError(f"Cannot convert string '{field0}'")
r = self.fill_value_static(idx, dtype)
# Make sure we copy strings out of the mmap buffer
if isinstance(r, str):
return r.__ptrcopy__()
else:
return r
def __call__(self, fields: Ptr[str], num_fields: int, row: int):
        def get_new_size(size: int, min_grow: int = 512):
            # Grow by ~25% (at least 'min_grow'), rounding the result up
            # to a multiple of 'min_grow'.
            new_size = size
            growth = size >> 2
            if growth <= min_grow:
                new_size += min_grow
            else:
                new_size += growth + min_grow - 1
                new_size &= ~(min_grow - 1)
            return new_size
arr = self.arr
cols = self.cols
cap = self.cap
if row >= cap:
new_cap = get_new_size(cap)
self.resize_arrays(new_cap)
# Lots of different cases to consider...
if cols is not None:
if isinstance(arr, Tuple):
for i in staticrange(staticlen(cols)):
col = cols[i]
arr[i].data[row] = self.convert_static(
fields[col], i, arr[i].dtype)
else:
if isinstance(arr.dtype, Tuple):
dummy = util.zero(arr.dtype)
tup = tuple(
self.convert(fields[cols[i]], i, type(dummy[i]))
for i in staticrange(staticlen(arr.dtype)))
arr._ptr((row, ))[0] = tup
else:
if arr.ndim == 1:
col = cols[0]
arr.data[row] = self.convert_static(
fields[col], 0, arr.dtype)
else:
for i in staticrange(staticlen(cols)):
col = cols[i]
arr._ptr((row, i))[0] = self.convert_static(
fields[col], i, arr.dtype)
else:
if isinstance(arr, Tuple):
for i in staticrange(staticlen(arr)):
arr[i]._ptr((row, ))[0] = self.convert_static(
fields[i], i, arr[i].dtype)
elif isinstance(arr.dtype, Tuple):
dummy = util.zero(arr.dtype)
tup = tuple(
self.convert(fields[i], i, type(dummy[i]))
for i in staticrange(staticlen(arr.dtype)))
arr._ptr((row, ))[0] = tup
else:
for i in range(num_fields):
arr._ptr(
(row, i))[0] = self.convert(fields[i].strip(), i,
arr.dtype)
self.last_row = row
def genfromtxt(fname,
dtype: type = float,
comments: Optional[str] = '#',
delimiter=None,
skip_header: int = 0,
skip_footer: int = 0,
converters=None,
filling_values=None,
usecols=None,
names=None,
excludelist=None,
               deletechars: str = " !#$%&'()*+,-./:;<=>?@[\\]^{|}~",
replace_space: str = '_',
autostrip: bool = False,
case_sensitive=True,
unpack: Static[int] = False,
loose: bool = True,
invalid_raise: bool = True,
max_rows: Optional[int] = None,
ndmin: Static[int] = 0):
def make_callback(csv,
cols,
converters,
filling_values,
loose: bool,
autostrip: bool,
num_fields: int,
block_size: int,
dtype: type,
unpack: Static[int] = False):
if cols is None:
if isinstance(dtype, Tuple):
if staticlen(dtype) != num_fields:
                    raise ValueError(
                        "number of fields differs from the given tuple 'dtype' length"
                    )
if unpack:
dummy = util.zero(dtype)
arr = tuple(
empty((block_size, ), type(dummy[i]))
for i in staticrange(staticlen(dtype)))
else:
arr = empty((block_size, ), dtype)
else:
arr = empty((block_size, num_fields), dtype)
xcols = cols
else:
if isinstance(dtype, Tuple):
if staticlen(dtype) != staticlen(cols):
compile_error(
"'usecols' has different length than given tuple 'dtype'"
)
xcols = csv.translate_cols(cols, num_fields)
if staticlen(xcols) == 1:
arr = empty((block_size, ), dtype)
else:
if unpack:
if isinstance(dtype, Tuple):
dummy = util.zero(dtype)
ncols: Static[int] = staticlen(xcols)
arr = tuple(
empty((block_size, ), type(dummy[i]))
for i in staticrange(ncols))
else:
arr = tuple(
empty((block_size, ), dtype) for _ in xcols)
else:
if isinstance(dtype, Tuple):
arr = empty((block_size, ), dtype)
else:
arr = empty((block_size, len(xcols)), dtype)
converters = make_conv(converters, num_fields, xcols, dtype)
callback = ArrayUpdateGen(arr,
cols=xcols,
conv=converters,
filling_values=filling_values,
autostrip=autostrip,
loose=loose)
return callback
def update_names(csv, excludelist, case_sensitive, replace_space: str,
deletechars: str):
if csv._names:
ex = ['return', 'file', 'print']
if excludelist is not None:
ex.extend(excludelist)
upper = False
lower = False
BAD_CASE_SENSITIVE: Static[
str] = "'case_sensitive' must be True, False, 'upper' or 'lower'"
if isinstance(case_sensitive, bool):
if not case_sensitive:
upper = True
elif isinstance(case_sensitive, str):
if case_sensitive == 'upper':
upper = True
elif case_sensitive == 'lower':
lower = True
else:
raise ValueError(BAD_CASE_SENSITIVE)
else:
compile_error(BAD_CASE_SENSITIVE)
csv.fix_names(excludelist=ex,
deletechars=deletechars,
replace_space=replace_space,
upper=upper,
lower=lower)
if isinstance(fname, str):
if fname.endswith('.gz'):
from gzip import open as gz_open
with gz_open(fname, 'rb') as f:
return genfromtxt(f,
dtype=dtype,
comments=comments,
delimiter=delimiter,
skip_header=skip_header,
skip_footer=skip_footer,
converters=converters,
filling_values=filling_values,
usecols=usecols,
names=names,
excludelist=excludelist,
deletechars=deletechars,
replace_space=replace_space,
autostrip=autostrip,
case_sensitive=case_sensitive,
unpack=unpack,
loose=loose,
invalid_raise=invalid_raise,
max_rows=max_rows,
ndmin=ndmin)
elif fname.endswith('.bz2'):
from bz2 import open as bz_open
with bz_open(fname, 'r') as f:
return genfromtxt(f,
dtype=dtype,
comments=comments,
delimiter=delimiter,
skip_header=skip_header,
skip_footer=skip_footer,
converters=converters,
filling_values=filling_values,
usecols=usecols,
names=names,
excludelist=excludelist,
deletechars=deletechars,
replace_space=replace_space,
autostrip=autostrip,
case_sensitive=case_sensitive,
unpack=unpack,
loose=loose,
invalid_raise=invalid_raise,
max_rows=max_rows,
ndmin=ndmin)
if max_rows is not None:
if skip_footer:
raise ValueError(
"The keywords 'skip_footer' and 'max_rows' can not be "
"specified at the same time.")
if max_rows < 1:
raise ValueError("'max_rows' must be at least 1.")
if ndmin != 0 and ndmin != 1 and ndmin != 2:
compile_error("'ndmin' must be 0, 1 or 2")
if delimiter is None:
dm = ''
dx = 0
spaced = False
elif isinstance(delimiter, str):
dm = delimiter
dx = 0
spaced = False
else:
dm = ''
dx = delimiter
spaced = True
if isinstance(usecols, int) or isinstance(usecols, str):
cols = (usecols, )
else:
cols = usecols
if cols is not None:
if staticlen(cols) == 0:
compile_error("cannot pass empty tuple to 'usecols'")
if ndmin != 0 and ndmin != 1 and ndmin != 2:
compile_error("'ndmin' must be 0, 1 or 2")
maxrows: int = max_rows if max_rows is not None else -1
block_size = maxrows if maxrows >= 0 else _DEFAULT_ROWS
if names is None:
get_names = False
given_names = None
elif isinstance(names, bool):
get_names = names
given_names = None
elif isinstance(names, str):
get_names = False
given_names = names.split(',')
        for i in range(len(given_names)):
            given_names[i] = given_names[i].strip()
elif isinstance(names, List[str]):
get_names = False
given_names = names
else:
get_names = False
given_names = [a for a in names]
if not isinstance(fname, str):
# line-by-line mode
if skip_footer > 0:
raise ValueError(
"'skip_footer' not supported in line-by-line mode")
csv = CSVReaderGen('',
delimiter=dm,
comments=comments if comments is not None else '')
k = 0
row = 0
first = True
num_fields = 0
fields = Ptr[str]()
callback = None
for line in fname:
if k < skip_header or (comments is not None
and line.startswith(comments)):
k += 1
continue
if maxrows >= 0 and row >= maxrows:
break
if first:
if spaced:
num_fields = csv.get_num_fields_single_spaced(
line, get_names=get_names, delimiter=dx)
else:
num_fields = csv.get_num_fields_single(line,
get_names=get_names)
update_names(csv,
excludelist=excludelist,
case_sensitive=case_sensitive,
replace_space=replace_space,
deletechars=deletechars)
callback = make_callback(csv=csv,
cols=cols,
converters=converters,
filling_values=filling_values,
loose=loose,
autostrip=autostrip,
num_fields=num_fields,
block_size=block_size,
dtype=dtype,
unpack=unpack)
fields = Ptr[str](num_fields)
first = False
            if spaced:
                row += int(
                    csv.parse_single_spaced(line, callback, row, fields,
                                            num_fields, invalid_raise, dx))
            else:
                row += int(
                    csv.parse_single(line, callback, row, fields,
                                     num_fields, invalid_raise))
k += 1
if callback is None:
raise ValueError("empty input")
callback.trim_arrays()
arr = callback.arr
if isinstance(arr, ndarray):
if unpack:
return min_dim(arr, ndmin).T
else:
return min_dim(arr, ndmin)
else:
return arr
else:
with CSVReaderGen(
fname,
delimiter=dm,
comments=comments if comments is not None else '') as csv:
if given_names is not None:
csv._names = given_names
csv.find_length(skip_footer)
i = 0
i = csv.skip_lines(i, skip_header)
i = csv.skip_comments(i)
if spaced:
num_fields, i = csv.get_num_fields_spaced(i,
get_names=get_names,
delimiter=dx)
else:
num_fields, i = csv.get_num_fields(i, get_names=get_names)
update_names(csv,
excludelist=excludelist,
case_sensitive=case_sensitive,
replace_space=replace_space,
deletechars=deletechars)
callback = make_callback(csv=csv,
cols=cols,
converters=converters,
filling_values=filling_values,
loose=loose,
autostrip=autostrip,
num_fields=num_fields,
block_size=block_size,
dtype=dtype,
unpack=unpack)
if spaced:
csv.parse_spaced(i,
callback,
num_fields=num_fields,
maxrows=maxrows,
invalid_raise=invalid_raise,
delimiter=dx)
else:
csv.parse(i,
callback,
num_fields=num_fields,
maxrows=maxrows,
invalid_raise=invalid_raise)
callback.trim_arrays()
arr = callback.arr
if isinstance(arr, ndarray):
if unpack:
return min_dim(arr, ndmin).T
else:
return min_dim(arr, ndmin)
else:
return arr
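# Usage sketch ('data.csv' is a hypothetical file): empty fields fall back
# to 'filling_values' (or a dtype-specific default):
#   g = genfromtxt('data.csv', dtype=float, delimiter=',',
#                  skip_header=1, filling_values=0.0)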
##########
# Pickle #
##########
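# Serialization format: a Fortran-order flag, the shape as 'ndim' integers,
# then the element data; contiguous arrays of atomic dtypes are written as
# one raw block, anything else element by element.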
@extend
class ndarray:
def __pickle__(self, jar: Jar):
from pickle import _write_raw
atomic = util.atomic(self.dtype)
cc, fc = self._contig
forder = (fc and not cc)
int(forder).__pickle__(jar)
for s in self.shape:
s.__pickle__(jar)
if atomic and (fc or cc):
_write_raw(jar, self.data.as_byte(), self.nbytes)
else:
for idx in util.multirange(self.shape):
e = self._ptr(idx)[0]
e.__pickle__(jar)
def __unpickle__(jar: Jar):
from pickle import _read_raw
atomic = util.atomic(dtype)
forder = bool(int.__unpickle__(jar))
shape = tuple(int.__unpickle__(jar) for _ in staticrange(ndim))
n = util.count(shape)
p = Ptr[dtype](n)
if atomic:
_read_raw(jar, p.as_byte(), n * util.sizeof(dtype))
else:
for i in range(n):
p[i] = dtype.__unpickle__(jar)
return ndarray(shape, p, fcontig=forder)