# Copyright (C) 2022-2025 Exaloop Inc. from .ndarray import ndarray from .routines import array, empty, asarray, atleast_2d import util import gzip import re _ALLOCATIONGRANULARITY: Static[int] = 16384 _PROT_NONE: Static[int] = 0 _PROT_READ: Static[int] = 1 _PROT_WRITE: Static[int] = 2 _PROT_EXEC: Static[int] = 4 _MAP_SHARED: Static[int] = 1 _MAP_PRIVATE: Static[int] = 2 _MAGIC_PREFIX: Static[str] = '\x93NUMPY' _ARRAY_ALIGN: Static[int] = 64 ################## # Memory Mapping # ################## def _io_error(base_msg: str): from C import seq_check_errno() -> str c_msg = seq_check_errno() raise IOError(f"{base_msg}: {c_msg}" if c_msg else base_msg) def _mmap(f, length: int, access: int, flags: int, offset: int = 0): from C import mmap(cobj, int, i32, i32, i32, int) -> cobj from C import fileno(cobj) -> i32 fd = fileno(f.fp) mm = mmap(cobj(), length, i32(access), i32(flags), fd, offset) if int(mm) == -1: _io_error("mmap() call failed") return mm def _munmap(p: cobj, length: int): from C import munmap(cobj, int) -> i32 if munmap(p, length) == i32(-1): _io_error("munmap() call failed") def _fix_mmap_mode(mode: str): if mode == 'readonly': mode = 'r' elif mode == 'copyonwrite': mode = 'c' elif mode == 'readwrite': mode = 'r+' elif mode == 'write': mode = 'w+' elif mode not in ('r', 'c', 'r+', 'w+'): raise ValueError( f"mode must be one of ['r', 'c', 'r+', 'w+', 'readonly', 'copyonwrite', 'readwrite', 'write'] (got {repr(mode)})" ) return mode def _memmap(fid, dtype: type, mode: str, offset: int, shape, forder: bool): fid.seek(0, 2) flen = fid.tell() dbytes = util.sizeof(dtype) if shape is None: nbytes = flen - offset if nbytes % dbytes: raise ValueError("Size of available data is not a " "multiple of the data-type size.") size = nbytes // dbytes sh = (size, ) else: if isinstance(shape, int): sh = (shape, ) else: sh = shape size = util.count(shape) nbytes = offset + size * dbytes if mode in ('w+', 'r+') and flen < nbytes: fid.seek(nbytes - 1, 0) fid.write('\0') fid.flush() if mode == 'c': flags = _MAP_PRIVATE acc = _PROT_READ | _PROT_WRITE elif mode == 'r': flags = _MAP_SHARED acc = _PROT_READ else: flags = _MAP_SHARED acc = _PROT_READ | _PROT_WRITE start = offset - offset % _ALLOCATIONGRANULARITY nbytes -= start array_offset = offset - start mm = _mmap(fid, length=nbytes, access=acc, flags=flags, offset=start) return ndarray(sh, Ptr[dtype](mm + array_offset), fcontig=forder) def memmap(filename, dtype: type = u8, mode: str = 'r+', offset: int = 0, shape=None, order: str = 'C'): mode = _fix_mmap_mode(mode) if mode == 'w+' and shape is None: raise ValueError("shape must be given if mode == 'w+'") if offset < 0: raise ValueError( f"memmap() 'offset' cannot be negative (got {offset})") forder = False if order == 'C' or order == 'c': forder = False elif order == 'F' or order == 'f': forder = True else: raise ValueError(f"memmap() 'order' must be 'C' or 'F' (got {order})") if hasattr(filename, 'read'): return _memmap(filename, dtype=dtype, mode=mode, offset=offset, shape=shape, forder=forder) elif isinstance(filename, str): with open(filename, ('r' if mode == 'c' else mode) + 'b') as f: return _memmap(f, dtype=dtype, mode=mode, offset=offset, shape=shape, forder=forder) else: compile_error("filename must be a string or file handle") ############### # General I/O # ############### def parse_header(header: str): regex = ( r"{\s*" r"(?:" r"(?:'descr'|\"descr\")\s*:\s*'(?P.*?)'\s*,?\s*" r"|" r"(?:'fortran_order'|\"fortran_order\")\s*:\s*(?PTrue|False)\s*,?\s*" r"|" r"(?:'shape'|\"shape\")\s*:\s*(?P\(\)|\(\d+(?:, \d+)*(?:,)?\))\s*,?\s*" r"){3}" r"\s*}") m = re.match(regex, header) if m: # Use the named groups to access the matched values dtype = m.group(1) fortran_order = m.group(2) shape = m.group(3) return shape, fortran_order, dtype else: raise ValueError(f"Cannot parse header: {repr(header)}") def _load(f, mmap_mode: Optional[str], ndim: Static[int], dtype: type): magic = f.read(len(_MAGIC_PREFIX)) if magic != _MAGIC_PREFIX: raise ValueError("Invalid magic string.") # Extract the major and minor version numbers major = f.read(1) minor = f.read(1) if (major != '\x01' and major != '\x02' and major != '\x03') or minor != '\x00': raise ValueError("Invalid version numbers.") # Extract the header length as a little-endian unsigned int if major == '\x01': header_len_enc = f.read(2) header_len = (int(header_len_enc.ptr[1]) << 8) | int( header_len_enc.ptr[0]) else: header_len_enc = f.read(4) header_len = ((int(header_len_enc.ptr[3]) << 24) | (int(header_len_enc.ptr[2]) << 16) | (int(header_len_enc.ptr[1]) << 8) | int(header_len_enc.ptr[0])) # Read the header data header_data = f.read(header_len) # Deserialize the header dictionary data shape, fortran_order, dt = parse_header(header_data) # Extract shape information from the header data elements = [ elem.strip() for elem in shape[1:-1].split(',') if elem.strip() ] if len(elements) != ndim: raise ValueError( f"Loaded array has dimension {len(elements)}, but expected dimension {ndim} (specified by 'ndim' argument)" ) shape = tuple(int(elements[i]) for i in staticrange(ndim)) forder = (fortran_order == 'True') # Read the binary data if mmap_mode is None: str_data = f.read() binary_data = str_data.ptr got_bytes = len(str_data) else: arr_data = _memmap(f, dtype=byte, mode=mmap_mode, offset=f.tell(), shape=None, forder=forder) binary_data = arr_data.data got_bytes = arr_data.size exp_bytes = util.count(shape) * util.sizeof(dtype) if got_bytes != exp_bytes: raise ValueError( f"Unexpected number of bytes read from file for given array shape and dtype (expected {exp_bytes} but got {got_bytes})" ) # Create a ndarray from the binary data data = Ptr[dtype](binary_data) array = ndarray[dtype, ndim](shape, data, fcontig=forder) if dt.startswith('>') and mmap_mode is None: array.byteswap(inplace=True) return array def load(file, mmap_mode: Optional[str] = None, ndim: Static[int] = 1, dtype: type = float): if mmap_mode is not None: mmap_mode = _fix_mmap_mode(mmap_mode) if mmap_mode == 'w+': raise ValueError("cannot use mmap_mode='w+' in load()") if hasattr(file, 'read'): return _load(file, mmap_mode=mmap_mode, ndim=ndim, dtype=dtype) elif isinstance(file, str): open_mode = 'rb' if mmap_mode is not None: open_mode = ('r' if mmap_mode == 'c' else mmap_mode) + 'b' with open(file, open_mode) as f: return _load(f, mmap_mode=mmap_mode, ndim=ndim, dtype=dtype) else: compile_error("fname must be a string or file handle") def _save(f, arr): arr = asarray(arr) cc, fc = arr._contig fortran_order = (fc and not cc) header_parts = ("{'descr': '", util.dtype_to_str(arr.dtype, include_byteorder=True), "', 'fortran_order': ", "True, 'shape': " if fortran_order else "False, 'shape': ", str(arr.shape), ", }") header_len = sum(len(h) for h in header_parts) + 1 # +1 for newline long_header = False if header_len > 0xffffffff: raise ValueError("Header is too long for .npy format") elif header_len > 0xffff: long_header = True else: long_header = False f.write(_MAGIC_PREFIX) f.write('\x02\x00' if long_header else '\x01\x00') m = len(_MAGIC_PREFIX) + 2 + (4 if long_header else 2) + header_len rem = m % _ARRAY_ALIGN spaces = (_ARRAY_ALIGN - rem) if rem else 0 header_len += spaces if long_header: byte1 = byte(header_len & 0xFF) byte2 = byte((header_len >> 8) & 0xFF) byte3 = byte((header_len >> 16) & 0xFF) byte4 = byte((header_len >> 24) & 0xFF) f.write(str(__ptr__(byte1), 1)) f.write(str(__ptr__(byte2), 1)) f.write(str(__ptr__(byte3), 1)) f.write(str(__ptr__(byte4), 1)) else: byte1 = byte(header_len & 0xFF) byte2 = byte((header_len >> 8) & 0xFF) f.write(str(__ptr__(byte1), 1)) f.write(str(__ptr__(byte2), 1)) for h in header_parts: f.write(h) for _ in range(spaces): f.write('\x20') f.write('\n') if cc or fc: f.write(str(arr.data.as_byte(), arr.nbytes)) else: for idx in util.multirange(arr.shape): e = arr._ptr(idx)[0] s = str(__ptr__(e).as_byte(), util.sizeof(arr.dtype)) f.write(s) def save(file, arr): if hasattr(file, 'write'): _save(file, arr) elif isinstance(file, str): if not file.endswith('.npy'): file += '.npy' with open(file, 'wb') as f: _save(f, arr) else: compile_error("fname must be a string or file handle") def _savetxt(f, X, delimiter: str, newline: str, header: str, footer: str, comments: str): X = asarray(X) if header: header = header.replace('\n', '\n' + comments) f.write(comments) f.write(header) f.write(newline) if X.ndim == 1: for x in X: f.write(str(x)) f.write(newline) elif X.ndim == 2: m, n = X.shape if m and n: for i in range(m): for j in range(n): x = X._ptr((i, j))[0] f.write(str(x)) if j < n - 1: f.write(delimiter) f.write(newline) else: compile_error("Expected 1D or 2D array") if footer: footer = footer.replace('\n', '\n' + comments) f.write(comments) f.write(footer) f.write(newline) def savetxt(fname, X, delimiter: str = ' ', newline: str = '\n', header: str = '', footer: str = '', comments: str = '# '): if hasattr(fname, 'write'): _savetxt(fname, X, delimiter=delimiter, newline=newline, header=header, footer=footer, comments=comments) elif isinstance(fname, str): if fname.endswith('.gz'): with gzip.open(fname, 'w9') as f: _savetxt(f, X, delimiter=delimiter, newline=newline, header=header, footer=footer, comments=comments) else: with open(fname, 'w') as f: _savetxt(f, X, delimiter=delimiter, newline=newline, header=header, footer=footer, comments=comments) else: compile_error("fname must be a string or a file handle") def _fromfile(f, dtype: type, count: int, sep: str, offset: int): if sep: if offset: raise TypeError( "'offset' argument only permitted for binary files") string_data = f.read() if sep.isspace(): string_split = string_data.split(None, count) else: string_split = string_data.split(sep, count) n = len(string_split) p = Ptr[dtype](n) for i in range(n): p[i] = dtype(string_split[i]) else: if offset: f.seek(offset, 1) if count < 0: binary_data = f.read() else: binary_data = f.read(count * util.sizeof(dtype)) p = Ptr[dtype](binary_data.ptr) n = len(binary_data) // util.sizeof(dtype) return ndarray((n, ), p) def fromfile(file, dtype: type = float, count: int = -1, sep: str = '', offset: int = 0): if hasattr(file, 'read'): return _fromfile(file, dtype=dtype, count=count, sep=sep, offset=offset) elif isinstance(file, str): with open(file, 'rb') as f: return _fromfile(f, dtype=dtype, count=count, sep=sep, offset=offset) else: compile_error("fname must be a string or a file handle") def fromstring(string: str, dtype: type = float, count: int = -1, sep: str = ''): if sep: split = string.split(sep, count) k = len(split) n = count if count >= 0 else k result = empty((n, ), dtype=dtype) p = result.data for i in range(k if count < 0 else min(k, count)): p[i] = dtype(split[i]) return result else: if count < 0: if len(string) < util.sizeof(dtype): raise ValueError("string is smaller than requested size") if len(string) % util.sizeof(dtype) != 0: raise ValueError( "string size must be a multiple of element size") else: if len(string) < count * util.sizeof(dtype): raise ValueError("string is smaller than requested size") n = count if count >= 0 else (len(string) // util.sizeof(dtype)) return ndarray((n, ), Ptr[dtype](string.ptr)) def _tofile(arr, f, sep: str): if sep: k = 0 n = arr.size for idx in util.multirange(arr.shape): e = arr._ptr(idx)[0] f.write(str(e)) if k < n - 1: f.write(sep) k += 1 else: cc, _ = arr._contig if cc: f.write(str(arr.data.as_byte(), arr.nbytes)) else: for idx in util.multirange(arr.shape): e = arr._ptr(idx)[0] s = str(__ptr__(e).as_byte(), util.sizeof(arr.dtype)) f.write(s) @extend class ndarray: def tofile(self, file, sep: str = ''): if hasattr(file, 'write'): _tofile(self, file, sep=sep) elif isinstance(file, str): with open(file, 'w' if sep else 'wb') as f: _tofile(self, f, sep=sep) else: compile_error("fname must be a string or file handle") ######################## # loadtxt / genfromtxt # ######################## _NEWLINE: Static[int] = 10 _CARTRIDGE: Static[int] = 13 _DEFAULT_ROWS: Static[int] = 512 @tuple class Converters: funcs: F mask: M usecols: U dtype: type F: type M: type U: type def __new__(funcs, mask, usecols, dtype: type) -> Converters[dtype, F, M, U]: return (funcs, mask, usecols) def __call__(self, field: str, idx: int): usecols = self.usecols if usecols is not None: idx = usecols[idx] if self.mask[idx]: return self.funcs[idx](field) else: return self.dtype(field) def normalize_col(col: int, num_fields: int): if col >= num_fields or col < -num_fields: raise ValueError( f"given column {col} is out of bounds (file has {num_fields} columns)" ) elif col < 0: return col + num_fields else: return col def default_fill(dtype: type): if dtype is bool: return False elif dtype is int or isinstance(dtype, Int) or isinstance(dtype, UInt): return dtype(-1) elif dtype is float or dtype is float32 or dtype is float16: return util.nan(dtype) elif dtype is complex: return complex(util.nan64(), 0.0) elif dtype is complex64: return complex64(util.nan32(), float32(0.0)) elif dtype is str: return '???' else: return util.zero(dtype) def malformed(row: int, num_fields: int): raise IOError( f"inconsistent number of fields in file (row = {row}, expected fields = {num_fields})" ) def min_dim(arr: ndarray, ndmin: Static[int]): if arr.ndim == 1 and ndmin == 2: return arr.reshape(arr.size, 1) else: return arr def make_conv(converters, num_fields: int, usecols, dtype: type): if isinstance(converters, Dict): funcs = Ptr[converters.V](num_fields) mask = Ptr[bool](num_fields) for i in range(num_fields): mask[i] = False for k, v in converters.items(): col = normalize_col(k, num_fields) mask[col] = True funcs[col] = v return Converters[dtype, type(funcs), type(mask), type(usecols)](funcs, mask, usecols, dtype) else: return converters class CSVReader: _path: str _delimiter: byte _quotechar: byte _comments: str _mmap_ptr: cobj _mmap_len: int def __init__(self, path: str, delimiter: str = ',', quotechar: str = '"', comments: str = ''): dm = byte(0) if len(delimiter) == 1: dm = delimiter.ptr[0] elif len(delimiter) != 0: raise ValueError("'delimiter' must be a length-1 string") qc = byte(0) if len(quotechar) == 1: qc = quotechar.ptr[0] elif len(quotechar) != 0: raise ValueError("'quotechar' must be a length-1 string") self._path = path self._delimiter = dm self._quotechar = qc self._comments = comments self._mmap_ptr = cobj() self._mmap_len = 0 def __enter__(self): with open(self._path) as f: f.seek(0, 2) n = f.tell() if n > 0: self._mmap_ptr = _mmap(f, length=n, access=_PROT_READ, flags=_MAP_SHARED) self._mmap_len = n if int(self._mmap_ptr) == -1: raise IOError("CSVReader error: mmap() failed") def __exit__(self): if self._mmap_len > 0: _munmap(self._mmap_ptr, self._mmap_len) self._mmap_ptr = cobj() self._mmap_len = 0 def is_delimiter(self, c: byte): delimiter = self._delimiter if delimiter: return c == delimiter else: return bool(_C.isspace(i32(int(c)))) def is_comment(self, c: byte): comments = self._comments for i in range(len(comments)): if comments.ptr[i] == c: return True return False def skip_delimiter(self, i: int): delimiter = self._delimiter i += 1 # Single-char case if delimiter: return i # Whitespace case p = self._mmap_ptr n = self._mmap_len while i < n: c = p[i] if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or not self.is_delimiter(c)): break i += 1 return i def skip_comments(self, i: int): if not self._comments: return i p = self._mmap_ptr n = self._mmap_len while i < n: c = p[i] if self.is_comment(c): i += 1 while i < n: c = p[i] i += 1 if c == byte(_NEWLINE) or c == byte(_CARTRIDGE): if c == byte(_CARTRIDGE) and i < n and p[i] == byte( _NEWLINE): i += 1 break else: break return i def skip_lines(self, i: int, skip: int): p = self._mmap_ptr n = self._mmap_len skipped = 0 while i < n and skipped < skip: c = p[i] if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)): i += 1 if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE): i += 1 skipped += 1 else: i += 1 return i def get_num_fields(self, i0: int): p = self._mmap_ptr n = self._mmap_len quotechar = self._quotechar comments = self._comments if n == 0: return 0 quote = False num_fields = 1 i = self.skip_comments(i0) if i >= n: return 0 # Parse first row to get field count while i < n: c = p[i] if quotechar and c == quotechar: if quote: if i + 1 < n and p[i + 1] == quotechar: i += 2 else: quote = False i += 1 else: quote = True i += 1 elif (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)) and not quote: i += 1 if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE): i += 1 break elif self.is_delimiter(c) and not quote: num_fields += 1 i = self.skip_delimiter(i) elif self.is_comment(c) and not quote: break else: i += 1 return num_fields def parse(self, row_callback, num_fields: int, maxrows: int, i0: int): p = self._mmap_ptr n = self._mmap_len quotechar = self._quotechar if n == 0 or num_fields == 0 or maxrows == 0: return quote = False field = 0 fields = Ptr[str](num_fields) row = 0 i = self.skip_comments(i0) last_field_start = i while i < n: c = p[i] if quotechar and c == quotechar: if quote: if i + 1 < n and p[i + 1] == quotechar: i += 2 else: quote = False i += 1 else: quote = True i += 1 elif (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)) and not quote: if field != num_fields - 1: malformed(row, num_fields) fields[field] = str(p + last_field_start, i - last_field_start) row_callback(fields, num_fields, row) i += 1 if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE): i += 1 i = self.skip_comments(i) last_field_start = i field = 0 row += 1 if maxrows >= 0 and row >= maxrows: break elif self.is_delimiter(c) and not quote: fields[field] = str(p + last_field_start, i - last_field_start) field += 1 if field >= num_fields: malformed(row, num_fields) i = self.skip_delimiter(i) last_field_start = i elif self.is_comment(c) and not quote: if field != num_fields - 1: malformed(row, num_fields) fields[field] = str(p + last_field_start, i - last_field_start) row_callback(fields, num_fields, row) i = self.skip_comments(i) last_field_start = i field = 0 row += 1 if maxrows >= 0 and row >= maxrows: break else: i += 1 if field > 0: if field != num_fields - 1: malformed(row, num_fields) if maxrows < 0 or row <= maxrows: fields[field] = str(p + last_field_start, i - last_field_start) row_callback(fields, num_fields, row) class ArrayUpdate: arr: A cols: C conv: F last_row: int A: type C: type F: type def __init__(self, arr: A, cols: C, conv: F): self.arr = arr self.cols = cols self.conv = conv self.last_row = 0 @property def cap(self): arr = self.arr if isinstance(arr, Tuple): return arr[0].size else: return arr.shape[0] def resize_arrays(self, new_cap: int): def resize_one(a, new_cap: int): if a.ndim == 1: data_new = util.realloc(a.data, new_cap, a.size) return ndarray((new_cap, ), data_new) else: rows, cols = a.shape data_new = util.realloc(a.data, new_cap * cols, a.size) return ndarray((new_cap, cols), data_new) arr = self.arr if isinstance(arr, Tuple): self.arr = tuple(resize_one(a, new_cap) for a in arr) else: self.arr = resize_one(arr, new_cap) def trim_arrays(self): if self.last_row < self.cap - 1: self.resize_arrays(self.last_row + 1) def convert(self, field: str, idx: int, dtype: type) -> dtype: conv = self.conv if conv is None: r = dtype(field) elif isinstance(conv, Converters): r = conv(field, idx) elif hasattr(conv, '__call__'): r = conv(field) else: r = conv[idx](field) # Make sure we copy strings out of the mmap buffer if isinstance(r, str): return r.__ptrcopy__() else: return r def convert_static(self, field: str, idx: Static[int], dtype: type) -> dtype: conv = self.conv if conv is None: r = dtype(field) elif isinstance(conv, Converters): r = conv(field, idx) elif hasattr(conv, '__call__'): r = conv(field) else: r = conv[idx](field) # Make sure we copy strings out of the mmap buffer if isinstance(r, str): return r.__ptrcopy__() else: return r def __call__(self, fields: Ptr[str], num_fields: int, row: int): def get_new_size(size: int, min_grow: int = 512): new_size = size growth = size >> 2 if growth <= min_grow: new_size += min_grow else: new_size += growth + min_grow - 1 new_size &= ~min_grow return new_size arr = self.arr cols = self.cols cap = self.cap if row >= cap: new_cap = get_new_size(cap) self.resize_arrays(new_cap) # Lots of different cases to consider... if cols is not None: if isinstance(arr, Tuple): for i in staticrange(staticlen(cols)): col = cols[i] arr[i].data[row] = self.convert_static( fields[col], i, arr[i].dtype) else: if isinstance(arr.dtype, Tuple): dummy = util.zero(arr.dtype) tup = tuple( self.convert(fields[cols[i]], i, type(dummy[i])) for i in staticrange(staticlen(arr.dtype))) arr._ptr((row, ))[0] = tup else: if arr.ndim == 1: col = cols[0] arr.data[row] = self.convert_static( fields[col], 0, arr.dtype) else: for i in staticrange(staticlen(cols)): col = cols[i] arr._ptr((row, i))[0] = self.convert_static( fields[col], i, arr.dtype) else: if isinstance(arr, Tuple): for i in staticrange(staticlen(arr)): arr[i]._ptr((row, ))[0] = self.convert_static( fields[i], i, arr[i].dtype) elif isinstance(arr.dtype, Tuple): dummy = util.zero(arr.dtype) tup = tuple( self.convert(fields[i], i, type(dummy[i])) for i in staticrange(staticlen(arr.dtype))) arr._ptr((row, ))[0] = tup else: for i in range(num_fields): arr._ptr( (row, i))[0] = self.convert(fields[i], i, arr.dtype) self.last_row = row def loadtxt(fname: str, dtype: type = float, comments: Optional[str] = '#', delimiter: Optional[str] = None, converters=None, skiprows: int = 0, usecols=None, unpack: Static[int] = False, ndmin: Static[int] = 0, max_rows: Optional[int] = None, quotechar: Optional[str] = None): if isinstance(usecols, int): cols = (usecols, ) else: cols = usecols if cols is not None: if staticlen(cols) == 0: compile_error("cannot pass empty tuple to 'usecols'") if ndmin != 0 and ndmin != 1 and ndmin != 2: compile_error("'ndmin' must be 0, 1 or 2") maxrows: int = max_rows if max_rows is not None else -1 block_size = maxrows if maxrows >= 0 else _DEFAULT_ROWS with CSVReader(fname, delimiter=(delimiter if delimiter is not None else ''), quotechar=(quotechar if quotechar is not None else ''), comments=comments if comments is not None else '') as csv: i0 = csv.skip_lines(i=0, skip=skiprows) if skiprows > 0 else 0 num_fields = csv.get_num_fields(i0) if cols is None: if isinstance(dtype, Tuple): if staticlen(dtype) != num_fields: raise ValueError( "number of fields is different than given tuple 'dtype' length" ) if unpack: dummy = util.zero(dtype) arr = tuple( empty((block_size, ), type(dummy[i])) for i in staticrange(staticlen(dtype))) else: arr = empty((block_size, ), dtype) else: arr = empty((block_size, num_fields), dtype) else: if isinstance(dtype, Tuple): if staticlen(dtype) != staticlen(cols): compile_error( "'usecols' has different length than given tuple 'dtype'" ) cols = tuple(normalize_col(col, num_fields) for col in cols) for i in range(1, len(cols)): for j in range(i): if cols[i] == cols[j]: raise ValueError( f"duplicate column {cols[i]} given in 'usecols'") if staticlen(cols) == 1: arr = empty((block_size, ), dtype) else: if unpack: if isinstance(dtype, Tuple): dummy = util.zero(dtype) ncols: Static[int] = staticlen(cols) arr = tuple( empty((block_size, ), type(dummy[i])) for i in staticrange(ncols)) else: arr = tuple(empty((block_size, ), dtype) for _ in cols) else: if isinstance(dtype, Tuple): arr = empty((block_size, ), dtype) else: arr = empty((block_size, len(cols)), dtype) converters = make_conv(converters, num_fields, cols, dtype) callback = ArrayUpdate(arr, cols, converters) csv.parse(callback, num_fields=num_fields, maxrows=maxrows, i0=i0) callback.trim_arrays() arr = callback.arr if isinstance(arr, ndarray): if unpack: return min_dim(arr, ndmin).T else: return min_dim(arr, ndmin) else: return arr class CSVReaderGen: _path: str _delimiter: byte _comments: str _names: List[str] _mmap_ptr: cobj _mmap_len: int _length: int def __init__(self, path: str, delimiter: str = ',', comments: str = ''): dm = byte(0) if len(delimiter) == 1: dm = delimiter.ptr[0] elif len(delimiter) != 0: raise ValueError("'delimiter' must be a length-1 string") self._path = path self._delimiter = dm self._comments = comments self._names = [] self._mmap_ptr = cobj() self._mmap_len = 0 self._length = 0 def __enter__(self): if not self._path: return with open(self._path) as f: f.seek(0, 2) n = f.tell() if n > 0: self._mmap_ptr = _mmap(f, length=n, access=_PROT_READ, flags=_MAP_SHARED) self._mmap_len = n if int(self._mmap_ptr) == -1: raise IOError("CSVReader error: mmap() failed") def __exit__(self): if self._mmap_len > 0: _munmap(self._mmap_ptr, self._mmap_len) self._mmap_ptr = cobj() self._mmap_len = 0 def fix_names(self, excludelist: List[str], deletechars: str, replace_space: str, upper: bool, lower: bool): def fix_name(name: str, excludelist: List[str], deletechars: str, replace_space: str, upper: bool, lower: bool): s = _strbuf(capacity=len(name)) for c in name: if c in deletechars: continue elif c.isspace(): s.append(replace_space) elif upper: s.append(c.upper()) elif lower: s.append(c.lower()) else: s.append(c) if s.__str__() in excludelist: s.append('_') return s.__str__() names = self._names for i in range(len(names)): names[i] = fix_name(names[i], excludelist=excludelist, deletechars=deletechars, replace_space=replace_space, upper=upper, lower=lower) def is_delimiter(self, c: byte): delimiter = self._delimiter if delimiter: return c == delimiter else: return bool(_C.isspace(i32(int(c)))) def is_comment(self, c: byte): comments = self._comments for i in range(len(comments)): if comments.ptr[i] == c: return True return False def skip_lines(self, i: int, skip: int): p = self._mmap_ptr n = self._length skipped = 0 while i < n and skipped < skip: c = p[i] if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)): i += 1 if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE): i += 1 skipped += 1 else: i += 1 return i def find_length(self, skip_footer: int): p = self._mmap_ptr n = self._mmap_len skipped = 0 if skip_footer <= 0: self._length = n return i = n - 1 # Newline at the very end doesn't count if i >= 0 and p[i] == byte(_NEWLINE): i -= 1 while i >= 0: c = p[i] if c == byte(_NEWLINE): skipped += 1 if skipped == skip_footer: self._length = i + 1 return i -= 1 self._length = 0 def skip_delimiter(self, i: int): delimiter = self._delimiter i += 1 # Single-char case if delimiter: return i # Whitespace case p = self._mmap_ptr n = self._length while i < n: c = p[i] if not self.is_delimiter(c): break i += 1 return i def skip_delimiter(self, i: int, line: str): delimiter = self._delimiter i += 1 # Single-char case if delimiter: return i # Whitespace case p = line.ptr n = len(line) while i < n: c = p[i] if not self.is_delimiter(c): break i += 1 return i def skip_comments(self, i: int): if not self._comments: return i p = self._mmap_ptr n = self._length while i < n: c = p[i] if self.is_comment(c): i += 1 while i < n: c = p[i] i += 1 if c == byte(_NEWLINE) or c == byte(_CARTRIDGE): if c == byte(_CARTRIDGE) and i < n and p[i] == byte( _NEWLINE): i += 1 break else: break return i def get_num_fields(self, i: int, get_names: bool): p = self._mmap_ptr n = self._length comments = self._comments names = self._names if i >= n: return 0, i i0 = i num_fields = 1 if self.is_comment(p[0]): i += 1 last_field_start = i while i < n: c = p[i] if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment( c): if get_names: name = str(p + last_field_start, i - last_field_start) names.append(name.strip()) i += 1 if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE): i += 1 break if self.is_delimiter(c): if get_names: name = str(p + last_field_start, i - last_field_start) names.append(name.strip()) num_fields += 1 i = self.skip_delimiter(i) last_field_start = i else: i += 1 return num_fields, i if get_names else i0 def partition_line(self, line: str, delimiter, num_fields: int, row: int, invalid_raise: bool): if isinstance(delimiter, int): n = 0 i = 0 while i < len(line) and n < num_fields: yield line[i:i + delimiter] i += delimiter n += 1 if n < num_fields: if invalid_raise: malformed(row, num_fields) while True: yield '' n += 1 if n >= num_fields: break else: n = 0 i = 0 while i < len(line) and n < len(delimiter): d = delimiter[n] if not isinstance(d, int): compile_error( "'delimiter' must be an int or a sequence of ints") yield line[i:i + d] i += d n += 1 if n < num_fields: if invalid_raise: malformed(row, num_fields) while True: yield '' n += 1 if n >= num_fields: break def get_num_fields_spaced(self, i: int, get_names: bool, delimiter): p = self._mmap_ptr n = self._length comments = self._comments names = self._names if i >= n: return 0, i i0 = i if self.is_comment(p[0]): i += 1 line_start = i line = '' while i < n: c = p[i] if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment( c): line = str(p + line_start, i - line_start) i += 1 if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE): i += 1 break else: i += 1 num_fields = 0 if isinstance(delimiter, int): num_fields = len(line) // delimiter + (1 if len(line) % delimiter else 0) else: num_fields = len(delimiter) if get_names: for name in self.partition_line(line, delimiter=delimiter, num_fields=num_fields, row=0, invalid_raise=False): names.append(name) return num_fields, i if get_names else i0 def get_num_fields_single(self, line: str, get_names: bool): p = line.ptr n = len(line) names = self._names i = 0 if n > 0 and self.is_comment(p[0]): i += 1 last_field_start = i num_fields = 0 while i < n: c = p[i] if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment( c): if get_names: name = str(p + last_field_start, i - last_field_start) names.append(name.strip()) num_fields += 1 break elif self.is_delimiter(c): if get_names: name = str(p + last_field_start, i - last_field_start) names.append(name.strip()) num_fields += 1 i = self.skip_delimiter(i, line) last_field_start = i else: i += 1 return num_fields def get_num_fields_single_spaced(self, line: str, get_names: bool, delimiter): p = line.ptr n = len(line) names = self._names i = 0 if n > 0 and self.is_comment(p[0]): i += 1 line_start = i while i < n: c = p[i] if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment( c): line = str(p + line_start, i - line_start) break else: i += 1 if isinstance(delimiter, int): num_fields = len(line) // delimiter + (1 if len(line) % delimiter else 0) else: num_fields = len(delimiter) if get_names: for name in self.partition_line(line, delimiter=delimiter, num_fields=num_fields, row=0, invalid_raise=False): names.append(name) return num_fields def translate_cols(self, usecols, num_fields: int): def translate_one(self, c, num_fields: int): if isinstance(c, int): return normalize_col(c, num_fields) elif isinstance(c, str): return self._names.index(c) else: compile_error("'usecols' elements must be either int or str") cols = tuple(translate_one(self, c, num_fields) for c in usecols) for i in range(1, len(cols)): for j in range(i): if cols[i] == cols[j]: raise ValueError( f"duplicate column {cols[i]} given in 'usecols'") return cols def parse(self, i: int, row_callback, num_fields: int, maxrows: int, invalid_raise: bool): p = self._mmap_ptr n = self._length if n == 0 or num_fields == 0 or maxrows == 0: return field = 0 fields = Ptr[str](num_fields) row = 0 last_field_start = i while i < n: c = p[i] if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)): ok = True if field != num_fields - 1: if invalid_raise: malformed(row, num_fields) else: ok = False if ok: fields[field] = str(p + last_field_start, i - last_field_start) row_callback(fields, num_fields, row) i += 1 if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE): i += 1 i = self.skip_comments(i) last_field_start = i field = 0 row += 1 if maxrows >= 0 and row >= maxrows: break elif self.is_delimiter(c): if invalid_raise or field < num_fields: fields[field] = str(p + last_field_start, i - last_field_start) field += 1 ok = True if field >= num_fields: if invalid_raise: malformed(row, num_fields) else: ok = False i = self.skip_delimiter(i) last_field_start = i elif self.is_comment(c): ok = True if field != num_fields - 1: if invalid_raise: malformed(row, num_fields) else: ok = False if ok: fields[field] = str(p + last_field_start, i - last_field_start) row_callback(fields, num_fields, row) i = self.skip_comments(i) last_field_start = i field = 0 row += 1 if maxrows >= 0 and row >= maxrows: break else: i += 1 if field > 0: ok = True if field != num_fields - 1: if invalid_raise: malformed(row, num_fields) else: ok = False if ok and (maxrows < 0 or row <= maxrows): fields[field] = str(p + last_field_start, i - last_field_start) row_callback(fields, num_fields, row) def parse_spaced(self, i: int, row_callback, num_fields: int, maxrows: int, invalid_raise: bool, delimiter): p = self._mmap_ptr n = self._length if n == 0 or num_fields == 0 or maxrows == 0: return fields = Ptr[str](num_fields) row = 0 last_line_start = i while i < n: c = p[i] if (c == byte(_NEWLINE) or c == byte(_CARTRIDGE)): line = str(p + last_line_start, i - last_line_start) k = 0 for f in self.partition_line(line, delimiter=delimiter, num_fields=num_fields, row=row, invalid_raise=invalid_raise): fields[k] = f k += 1 row_callback(fields, num_fields, row) i += 1 if c == byte(_CARTRIDGE) and i < n and p[i] == byte(_NEWLINE): i += 1 i = self.skip_comments(i) last_line_start = i row += 1 if maxrows >= 0 and row >= maxrows: break elif self.is_comment(c): line = str(p + last_line_start, i - last_line_start) k = 0 for f in self.partition_line(line, delimiter=delimiter, num_fields=num_fields, row=row, invalid_raise=invalid_raise): fields[k] = f k += 1 row_callback(fields, num_fields, row) i = self.skip_comments(i) last_line_start = i row += 1 if maxrows >= 0 and row >= maxrows: break else: i += 1 if last_line_start < i and (maxrows < 0 or row < maxrows): line = str(p + last_line_start, i - last_line_start) k = 0 for f in self.partition_line(line, delimiter=delimiter, num_fields=num_fields, row=row, invalid_raise=invalid_raise): fields[k] = f k += 1 row_callback(fields, num_fields, row) def parse_single(self, line: str, row_callback, row: int, fields: Ptr[str], num_fields: int, invalid_raise: bool): if not line: return False p = line.ptr n = len(line) i = 0 last_field_start = 0 field = 0 if n > 0 and self.is_comment(p[0]): return False while i < n: c = p[i] if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment( c): break elif self.is_delimiter(c): if field >= num_fields: if invalid_raise: malformed(row, num_fields) else: return False fields[field] = str(p + last_field_start, i - last_field_start) field += 1 i = self.skip_delimiter(i, line) last_field_start = i else: i += 1 if i > last_field_start: if field >= num_fields: if invalid_raise: malformed(row, num_fields) else: return False fields[field] = str(p + last_field_start, i - last_field_start) field += 1 if field != num_fields: if invalid_raise: malformed(row, num_fields) else: return False row_callback(fields, num_fields, row) return True def parse_single_spaced(self, line: str, row_callback, row: int, fields: Ptr[str], num_fields: int, invalid_raise: bool, delimiter): if not line: return False p = line.ptr n = len(line) i = 0 if n > 0 and self.is_comment(p[0]): return False while i < n: c = p[i] if c == byte(_NEWLINE) or c == byte(_CARTRIDGE) or self.is_comment( c): line = line[:i] break else: i += 1 k = 0 for f in self.partition_line(line, delimiter=delimiter, num_fields=num_fields, row=row, invalid_raise=invalid_raise): fields[k] = f k += 1 row_callback(fields, num_fields, row) return True class ArrayUpdateGen: arr: A cols: C conv: F filling_values: M autostrip: bool loose: bool last_row: int A: type C: type F: type M: type def __init__(self, arr: A, cols: C, conv: F, filling_values: M, autostrip: bool, loose: bool): self.arr = arr self.cols = cols self.conv = conv self.filling_values = filling_values self.autostrip = autostrip self.loose = loose self.last_row = 0 @property def cap(self): arr = self.arr if isinstance(arr, Tuple): return arr[0].size else: return arr.shape[0] def resize_arrays(self, new_cap: int): def resize_one(a, new_cap: int): if a.ndim == 1: data_new = util.realloc(a.data, new_cap, a.size) return ndarray((new_cap, ), data_new) else: rows, cols = a.shape data_new = util.realloc(a.data, new_cap * cols, a.size) return ndarray((new_cap, cols), data_new) arr = self.arr if isinstance(arr, Tuple): self.arr = tuple(resize_one(a, new_cap) for a in arr) else: self.arr = resize_one(arr, new_cap) def trim_arrays(self): if self.last_row < self.cap - 1: self.resize_arrays(self.last_row + 1) def fill_value(self, idx: int, dtype: type) -> dtype: filling_values = self.filling_values if filling_values is None: return default_fill(dtype) elif (isinstance(filling_values, List) or isinstance(filling_values, Tuple) or isinstance(filling_values, ndarray)): return filling_values[idx] elif isinstance(filling_values, Dict): default = default_fill(dtype) if filling_values.K is int: return filling_values.get(idx, default) elif filling_values.K is str: return filling_values.get(self._names[idx], default) else: compile_error("'filling_values' keys must be int or str") else: return filling_values def fill_value_static(self, idx: Static[int], dtype: type) -> dtype: filling_values = self.filling_values if filling_values is None: return default_fill(dtype) elif (isinstance(filling_values, List) or isinstance(filling_values, Tuple) or isinstance(filling_values, ndarray)): return filling_values[idx] elif isinstance(filling_values, Dict): default = default_fill(dtype) if filling_values.K is int: return filling_values.get(idx, default) elif filling_values.K is str: return filling_values.get(self._names[idx], default) else: compile_error("'filling_values' keys must be int or str") else: return filling_values def convert(self, field: str, idx: int, dtype: type) -> dtype: field0 = field conv = self.conv if self.autostrip: field = field.strip() if not field and conv is None: r = self.fill_value(idx, dtype) else: try: if conv is None: r = dtype(field) elif isinstance(conv, Converters): r = conv(field, idx) elif hasattr(conv, '__call__'): r = conv(field) else: r = conv[idx](field) except: if not self.loose: raise ValueError(f"Cannot convert string '{field0}'") r = self.fill_value(idx, dtype) # Make sure we copy strings out of the mmap buffer if isinstance(r, str): return r.__ptrcopy__() else: return r def convert_static(self, field: str, idx: Static[int], dtype: type) -> dtype: field0 = field conv = self.conv if self.autostrip: field = field.strip() if not field and conv is None: r = self.fill_value_static(idx, dtype) else: try: if conv is None: r = dtype(field) elif isinstance(conv, Converters): r = conv(field, idx) elif hasattr(conv, '__call__'): r = conv(field) else: r = conv[idx](field) except: if not self.loose: raise ValueError(f"Cannot convert string '{field0}'") r = self.fill_value_static(idx, dtype) # Make sure we copy strings out of the mmap buffer if isinstance(r, str): return r.__ptrcopy__() else: return r def __call__(self, fields: Ptr[str], num_fields: int, row: int): def get_new_size(size: int, min_grow: int = 512): new_size = size growth = size >> 2 if growth <= min_grow: new_size += min_grow else: new_size += growth + min_grow - 1 new_size &= ~min_grow return new_size arr = self.arr cols = self.cols cap = self.cap if row >= cap: new_cap = get_new_size(cap) self.resize_arrays(new_cap) # Lots of different cases to consider... if cols is not None: if isinstance(arr, Tuple): for i in staticrange(staticlen(cols)): col = cols[i] arr[i].data[row] = self.convert_static( fields[col], i, arr[i].dtype) else: if isinstance(arr.dtype, Tuple): dummy = util.zero(arr.dtype) tup = tuple( self.convert(fields[cols[i]], i, type(dummy[i])) for i in staticrange(staticlen(arr.dtype))) arr._ptr((row, ))[0] = tup else: if arr.ndim == 1: col = cols[0] arr.data[row] = self.convert_static( fields[col], 0, arr.dtype) else: for i in staticrange(staticlen(cols)): col = cols[i] arr._ptr((row, i))[0] = self.convert_static( fields[col], i, arr.dtype) else: if isinstance(arr, Tuple): for i in staticrange(staticlen(arr)): arr[i]._ptr((row, ))[0] = self.convert_static( fields[i], i, arr[i].dtype) elif isinstance(arr.dtype, Tuple): dummy = util.zero(arr.dtype) tup = tuple( self.convert(fields[i], i, type(dummy[i])) for i in staticrange(staticlen(arr.dtype))) arr._ptr((row, ))[0] = tup else: for i in range(num_fields): arr._ptr( (row, i))[0] = self.convert(fields[i].strip(), i, arr.dtype) self.last_row = row def genfromtxt(fname, dtype: type = float, comments: Optional[str] = '#', delimiter=None, skip_header: int = 0, skip_footer: int = 0, converters=None, filling_values=None, usecols=None, names=None, excludelist=None, deletechars: str = " !#$%&'()*+, -./:;<=>?@[\\]^{|}~", replace_space: str = '_', autostrip: bool = False, case_sensitive=True, unpack: Static[int] = False, loose: bool = True, invalid_raise: bool = True, max_rows: Optional[int] = None, ndmin: Static[int] = 0): def make_callback(csv, cols, converters, filling_values, loose: bool, autostrip: bool, num_fields: int, block_size: int, dtype: type, unpack: Static[int] = False): if cols is None: if isinstance(dtype, Tuple): if staticlen(dtype) != num_fields: raise ValueError( "number of fields is different than given tuple 'dtype' length" ) if unpack: dummy = util.zero(dtype) arr = tuple( empty((block_size, ), type(dummy[i])) for i in staticrange(staticlen(dtype))) else: arr = empty((block_size, ), dtype) else: arr = empty((block_size, num_fields), dtype) xcols = cols else: if isinstance(dtype, Tuple): if staticlen(dtype) != staticlen(cols): compile_error( "'usecols' has different length than given tuple 'dtype'" ) xcols = csv.translate_cols(cols, num_fields) if staticlen(xcols) == 1: arr = empty((block_size, ), dtype) else: if unpack: if isinstance(dtype, Tuple): dummy = util.zero(dtype) ncols: Static[int] = staticlen(xcols) arr = tuple( empty((block_size, ), type(dummy[i])) for i in staticrange(ncols)) else: arr = tuple( empty((block_size, ), dtype) for _ in xcols) else: if isinstance(dtype, Tuple): arr = empty((block_size, ), dtype) else: arr = empty((block_size, len(xcols)), dtype) converters = make_conv(converters, num_fields, xcols, dtype) callback = ArrayUpdateGen(arr, cols=xcols, conv=converters, filling_values=filling_values, autostrip=autostrip, loose=loose) return callback def update_names(csv, excludelist, case_sensitive, replace_space: str, deletechars: str): if csv._names: ex = ['return', 'file', 'print'] if excludelist is not None: ex.extend(excludelist) upper = False lower = False BAD_CASE_SENSITIVE: Static[ str] = "'case_sensitive' must be True, False, 'upper' or 'lower'" if isinstance(case_sensitive, bool): if not case_sensitive: upper = True elif isinstance(case_sensitive, str): if case_sensitive == 'upper': upper = True elif case_sensitive == 'lower': lower = True else: raise ValueError(BAD_CASE_SENSITIVE) else: compile_error(BAD_CASE_SENSITIVE) csv.fix_names(excludelist=ex, deletechars=deletechars, replace_space=replace_space, upper=upper, lower=lower) if isinstance(fname, str): if fname.endswith('.gz'): from gzip import open as gz_open with gz_open(fname, 'rb') as f: return genfromtxt(f, dtype=dtype, comments=comments, delimiter=delimiter, skip_header=skip_header, skip_footer=skip_footer, converters=converters, filling_values=filling_values, usecols=usecols, names=names, excludelist=excludelist, deletechars=deletechars, replace_space=replace_space, autostrip=autostrip, case_sensitive=case_sensitive, unpack=unpack, loose=loose, invalid_raise=invalid_raise, max_rows=max_rows, ndmin=ndmin) elif fname.endswith('.bz2'): from bz2 import open as bz_open with bz_open(fname, 'r') as f: return genfromtxt(f, dtype=dtype, comments=comments, delimiter=delimiter, skip_header=skip_header, skip_footer=skip_footer, converters=converters, filling_values=filling_values, usecols=usecols, names=names, excludelist=excludelist, deletechars=deletechars, replace_space=replace_space, autostrip=autostrip, case_sensitive=case_sensitive, unpack=unpack, loose=loose, invalid_raise=invalid_raise, max_rows=max_rows, ndmin=ndmin) if max_rows is not None: if skip_footer: raise ValueError( "The keywords 'skip_footer' and 'max_rows' can not be " "specified at the same time.") if max_rows < 1: raise ValueError("'max_rows' must be at least 1.") if ndmin != 0 and ndmin != 1 and ndmin != 2: compile_error("'ndmin' must be 0, 1 or 2") if delimiter is None: dm = '' dx = 0 spaced = False elif isinstance(delimiter, str): dm = delimiter dx = 0 spaced = False else: dm = '' dx = delimiter spaced = True if isinstance(usecols, int) or isinstance(usecols, str): cols = (usecols, ) else: cols = usecols if cols is not None: if staticlen(cols) == 0: compile_error("cannot pass empty tuple to 'usecols'") if ndmin != 0 and ndmin != 1 and ndmin != 2: compile_error("'ndmin' must be 0, 1 or 2") maxrows: int = max_rows if max_rows is not None else -1 block_size = maxrows if maxrows >= 0 else _DEFAULT_ROWS if names is None: get_names = False given_names = None elif isinstance(names, bool): get_names = names given_names = None elif isinstance(names, str): get_names = False given_names = names.split(',') for i in range(names): names[i] = names[i].strip() elif isinstance(names, List[str]): get_names = False given_names = names else: get_names = False given_names = [a for a in names] if not isinstance(fname, str): # line-by-line mode if skip_footer > 0: raise ValueError( "'skip_footer' not supported in line-by-line mode") csv = CSVReaderGen('', delimiter=dm, comments=comments if comments is not None else '') k = 0 row = 0 first = True num_fields = 0 fields = Ptr[str]() callback = None for line in fname: if k < skip_header or (comments is not None and line.startswith(comments)): k += 1 continue if maxrows >= 0 and row >= maxrows: break if first: if spaced: num_fields = csv.get_num_fields_single_spaced( line, get_names=get_names, delimiter=dx) else: num_fields = csv.get_num_fields_single(line, get_names=get_names) update_names(csv, excludelist=excludelist, case_sensitive=case_sensitive, replace_space=replace_space, deletechars=deletechars) callback = make_callback(csv=csv, cols=cols, converters=converters, filling_values=filling_values, loose=loose, autostrip=autostrip, num_fields=num_fields, block_size=block_size, dtype=dtype, unpack=unpack) fields = Ptr[str](num_fields) first = False if not get_names: if spaced: row += int( csv.parse_single_spaced(line, callback, row, fields, num_fields, invalid_raise, dx)) else: row += int( csv.parse_single(line, callback, row, fields, num_fields, invalid_raise)) else: if spaced: row += int( csv.parse_single_spaced(line, callback, row, fields, num_fields, invalid_raise, dx)) else: row += int( csv.parse_single(line, callback, row, fields, num_fields, invalid_raise)) k += 1 if callback is None: raise ValueError("empty input") callback.trim_arrays() arr = callback.arr if isinstance(arr, ndarray): if unpack: return min_dim(arr, ndmin).T else: return min_dim(arr, ndmin) else: return arr else: with CSVReaderGen( fname, delimiter=dm, comments=comments if comments is not None else '') as csv: if given_names is not None: csv._names = given_names csv.find_length(skip_footer) i = 0 i = csv.skip_lines(i, skip_header) i = csv.skip_comments(i) if spaced: num_fields, i = csv.get_num_fields_spaced(i, get_names=get_names, delimiter=dx) else: num_fields, i = csv.get_num_fields(i, get_names=get_names) update_names(csv, excludelist=excludelist, case_sensitive=case_sensitive, replace_space=replace_space, deletechars=deletechars) callback = make_callback(csv=csv, cols=cols, converters=converters, filling_values=filling_values, loose=loose, autostrip=autostrip, num_fields=num_fields, block_size=block_size, dtype=dtype, unpack=unpack) if spaced: csv.parse_spaced(i, callback, num_fields=num_fields, maxrows=maxrows, invalid_raise=invalid_raise, delimiter=dx) else: csv.parse(i, callback, num_fields=num_fields, maxrows=maxrows, invalid_raise=invalid_raise) callback.trim_arrays() arr = callback.arr if isinstance(arr, ndarray): if unpack: return min_dim(arr, ndmin).T else: return min_dim(arr, ndmin) else: return arr ########## # Pickle # ########## @extend class ndarray: def __pickle__(self, jar: Jar): from pickle import _write_raw atomic = util.atomic(self.dtype) cc, fc = self._contig forder = (fc and not cc) int(forder).__pickle__(jar) for s in self.shape: s.__pickle__(jar) if atomic and (fc or cc): _write_raw(jar, self.data.as_byte(), self.nbytes) else: for idx in util.multirange(self.shape): e = self._ptr(idx)[0] e.__pickle__(jar) def __unpickle__(jar: Jar): from pickle import _read_raw atomic = util.atomic(dtype) forder = bool(int.__unpickle__(jar)) shape = tuple(int.__unpickle__(jar) for _ in staticrange(ndim)) n = util.count(shape) p = Ptr[dtype](n) if atomic: _read_raw(jar, p.as_byte(), n * util.sizeof(dtype)) else: for i in range(n): p[i] = dtype.__unpickle__(jar) return ndarray(shape, p, fcontig=forder)