# Copyright (C) 2022-2025 Exaloop Inc.

import util

newaxis = None

_FLAG_C_CONTIGUOUS   : Static[int] = 0x0001
_FLAG_F_CONTIGUOUS   : Static[int] = 0x0002
_FLAG_OWNDATA        : Static[int] = 0x0004
_FLAG_FORCECAST      : Static[int] = 0x0010
_FLAG_ENSURECOPY     : Static[int] = 0x0020
_FLAG_ENSUREARRAY    : Static[int] = 0x0040
_FLAG_ELEMENTSTRIDES : Static[int] = 0x0080
_FLAG_ALIGNED        : Static[int] = 0x0100
_FLAG_NOTSWAPPED     : Static[int] = 0x0200
_FLAG_WRITEABLE      : Static[int] = 0x0400
_FLAG_WRITEBACKIFCOPY: Static[int] = 0x2000
_FLAG_ENSURENOCOPY   : Static[int] = 0x4000

_FLAG_BEHAVED       : Static[int] = (_FLAG_ALIGNED | _FLAG_WRITEABLE)
_FLAG_BEHAVED_NS    : Static[int] = (_FLAG_ALIGNED | _FLAG_WRITEABLE | _FLAG_NOTSWAPPED)
_FLAG_CARRAY        : Static[int] = (_FLAG_C_CONTIGUOUS | _FLAG_BEHAVED)
_FLAG_CARRAY_RO     : Static[int] = (_FLAG_C_CONTIGUOUS | _FLAG_ALIGNED)
_FLAG_FARRAY        : Static[int] = (_FLAG_F_CONTIGUOUS | _FLAG_BEHAVED)
_FLAG_FARRAY_RO     : Static[int] = (_FLAG_F_CONTIGUOUS | _FLAG_ALIGNED)
_FLAG_DEFAULT       : Static[int] = (_FLAG_CARRAY)
_FLAG_IN_ARRAY      : Static[int] = (_FLAG_CARRAY_RO)
_FLAG_OUT_ARRAY     : Static[int] = (_FLAG_CARRAY)
_FLAG_INOUT_ARRAY   : Static[int] = (_FLAG_CARRAY)
_FLAG_INOUT_ARRAY2  : Static[int] = (_FLAG_CARRAY | _FLAG_WRITEBACKIFCOPY)
_FLAG_IN_FARRAY     : Static[int] = (_FLAG_FARRAY_RO)
_FLAG_OUT_FARRAY    : Static[int] = (_FLAG_FARRAY)
_FLAG_INOUT_FARRAY  : Static[int] = (_FLAG_FARRAY)
_FLAG_INOUT_FARRAY2 : Static[int] = (_FLAG_FARRAY | _FLAG_WRITEBACKIFCOPY)
_FLAG_UPDATE_ALL    : Static[int] = (_FLAG_C_CONTIGUOUS | _FLAG_F_CONTIGUOUS | _FLAG_ALIGNED)

@tuple
class flagsobj:
    _flags: u32

    def __new__(f: int):
        return flagsobj(u32(f))

    def __new__(ccontig: bool, fcontig: bool):
        f = _FLAG_ALIGNED | _FLAG_WRITEABLE
        if ccontig:
            f |= _FLAG_C_CONTIGUOUS
        if fcontig:
            f |= _FLAG_F_CONTIGUOUS
        return flagsobj(f)

    def _with(self, f: int):
        return flagsobj(self._flags | u32(f))

    def _without(self, f: int):
        return flagsobj(self._flags & ~u32(f))

    def _unown(self):
        return self._without(_FLAG_OWNDATA)

    @property
    def num(self):
        return int(self._flags)

    @property
    def c_contiguous(self):
        return bool(self._flags & u32(_FLAG_C_CONTIGUOUS))

    @property
    def f_contiguous(self):
        return bool(self._flags & u32(_FLAG_F_CONTIGUOUS))

    @property
    def contiguous(self):
        return self.c_contiguous or self.f_contiguous

    @property
    def owndata(self):
        return bool(self._flags & u32(_FLAG_OWNDATA))

    @property
    def writeable(self):
        return bool(self._flags & u32(_FLAG_WRITEABLE))

    @property
    def aligned(self):
        return bool(self._flags & u32(_FLAG_ALIGNED))

    @property
    def writebackifcopy(self):
        return bool(self._flags & u32(_FLAG_WRITEBACKIFCOPY))

    def __str__(self):
        return (f'  C_CONTIGUOUS : {self.c_contiguous}\n'
                f'  F_CONTIGUOUS : {self.f_contiguous}\n'
                f'  OWNDATA : {self.owndata}\n'
                f'  WRITEABLE : {self.writeable}\n'
                f'  ALIGNED : {self.aligned}\n'
                f'  WRITEBACKIFCOPY : {self.writebackifcopy}\n')
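# Illustrative sketch (comment only, not part of the module): how the flag
# bits above surface through `flagsobj`. The two-argument constructor always
# sets ALIGNED and WRITEABLE and then adds the contiguity bits it is given:
#
#   f = flagsobj(True, False)   # e.g. flags of a freshly created C-order array
#   f.c_contiguous              # -> True
#   f.f_contiguous              # -> False
#   f.writeable                 # -> True
#   f.num                       # -> same bit pattern as _FLAG_CARRAY
#
# `ndarray.flags` below builds such an object from the array's computed
# contiguity (`_contig`), so OWNDATA and WRITEBACKIFCOPY read as False here.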
class flatiter[A]:
    base: A
    index: int

    def __init__(self, base: A):
        self.base = base
        self.index = 0

    def _index_to_coords(self, index: int):
        return util.index_to_coords(index, self.base.shape)

    @property
    def coords(self):
        shape = self.base.shape
        if staticlen(shape) == 0:
            return ()
        if self.index >= self.base.size:
            return (shape[0],) + (0,) * (staticlen(shape) - 1)
        else:
            return self._index_to_coords(self.index)

    def __iter__(self):
        arr = self.base
        limits = arr.shape
        N: Static[int] = staticlen(limits)
        curr = self.coords
        s = Ptr[int](__ptr__(limits).as_byte())
        p = Ptr[int](__ptr__(curr).as_byte())
        limit = arr.size

        while self.index < limit:
            curr0 = curr
            p[N - 1] += 1

            for i in range(N - 1, -1, -1):
                if p[i] >= s[i]:
                    p[i] = 0
                    p[max(i - 1, 0)] += 1
                else:
                    break

            self.index += 1
            yield arr._ptr(curr0)[0]

    def _fix_index(self, index: int):
        n = self.base.size
        i = index

        if index < 0:
            index += n

        if index < 0 or index >= n:
            raise IndexError(f"index {i} into flatiter is out of bounds for array of size {n}")

        return index

    def __getitem__(self, index: int):
        index = self._fix_index(index)
        coord = self._index_to_coords(index)
        self.index = 0
        return self.base._ptr(coord)[0]

    def __getitem__(self, s: slice):
        base = self.base
        dtype = base.dtype
        start, stop, step, length = s.adjust_indices(self.base.size)
        p = Ptr[dtype](length)
        off = 0

        # Gather the selected elements into a freshly allocated 1-D array.
        for i in range(start, stop, step):
            coord = self._index_to_coords(i)
            p[off] = base._ptr(coord)[0]
            off += 1

        self.index = 0
        return ndarray((length,), p)

    def __setitem__(self, index: int, value):
        base = self.base
        dtype = base.dtype
        index = self._fix_index(index)
        coord = self._index_to_coords(index)
        base._ptr(coord)[0] = util.cast(value, dtype)
        self.index = 0

    def __setitem__(self, s: slice, value):
        base = self.base
        dtype = base.dtype
        start, stop, step, _ = s.adjust_indices(self.base.size)
        off = 0

        for i in range(start, stop, step):
            coord = self._index_to_coords(i)
            if hasattr(value, "__getitem__"):
                base._ptr(coord)[0] = util.cast(value[off % len(value)], dtype)
            else:
                base._ptr(coord)[0] = util.cast(value, dtype)
            off += 1

        self.index = 0

    def copy(self):
        return self.base.flatten()

@tuple(init=False)
class _UnaryFunctor:
    op: F
    F: type

    def __new__(op: F, F: type) -> _UnaryFunctor[F]:
        return (op, )

    def __call__(self, y, x):
        y[0] = self.op(x[0])

@tuple(init=False)
class _InplaceUnaryFunctor:
    op: F
    F: type

    def __new__(op: F, F: type) -> _InplaceUnaryFunctor[F]:
        return (op, )

    def __call__(self, x):
        x[0] = self.op(x[0])

@tuple(init=False)
class _BinaryFunctor:
    op: F
    F: type
    R1: type
    R2: type

    def __new__(op: F, R1: type, R2: type, F: type) -> _BinaryFunctor[F, R1, R2]:
        return (op, )

    def __call__(self, z, x, y):
        z[0] = self.op(util.cast(x[0], R1), util.cast(y[0], R2))

@tuple(init=False)
class _InplaceBinaryFunctor:
    op: F
    F: type

    def __new__(op: F, F: type) -> _InplaceBinaryFunctor[F]:
        return (op, )

    def __call__(self, x, y):
        x[0] = self.op(x[0], util.cast(y[0], type(x[0])))

@tuple(init=False)
class _RightBinaryFunctor:
    op: F
    F: type
    R1: type
    R2: type

    def __new__(op: F, R1: type, R2: type, F: type) -> _RightBinaryFunctor[F, R1, R2]:
        return (op, )

    def __call__(self, z, x, y):
        z[0] = self.op(util.cast(y[0], R2), util.cast(x[0], R1))

@tuple(init=False)
class _ScalarFunctor:
    op: F
    y: Y
    F: type
    Y: type
    R1: type
    R2: type

    def __new__(op: F, y: Y, R1: type, R2: type, F: type, Y: type) -> _ScalarFunctor[F, Y, R1, R2]:
        return (op, y)

    def __call__(self, z, x):
        z[0] = self.op(util.cast(x[0], R1), util.cast(self.y, R2))

@tuple(init=False)
class _InplaceScalarFunctor:
    op: F
    y: Y
    F: type
    Y: type

    def __new__(op: F, y: Y, F: type, Y: type) -> _InplaceScalarFunctor[F, Y]:
        return (op, y)

    def __call__(self, x):
        x[0] = self.op(x[0], util.cast(self.y, type(x[0])))

@tuple(init=False)
class _RightScalarFunctor:
    op: F
    y: Y
    F: type
    Y: type
    R1: type
    R2: type

    def __new__(op: F, y: Y, R1: type, R2: type, F: type, Y: type) -> _RightScalarFunctor[F, Y, R1, R2]:
        return (op, y)

    def __call__(self, z, x):
        z[0] = self.op(util.cast(self.y, R2), util.cast(x[0], R1))
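# Illustrative sketch (comment only): the functor classes above are small
# callable tuples that `ndarray._loop` applies to raw element pointers. For
# example, an elementwise add over arrays `x` and `y` producing `z` would be
# driven roughly as
#
#   fn = _BinaryFunctor(op=lambda a, b: a + b, R1=float, R2=float)
#   fn(z_ptr, x_ptr, y_ptr)     # writes z_ptr[0] = x_ptr[0] + y_ptr[0]
#
# where the pointers are supplied by `_loop` for every element it visits.
# The `_Inplace*` variants write back into their first argument, and the
# `_Right*` variants swap operand order for reflected operators.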
@tuple(init=False)
class ndarray[dtype, ndim: Static[int]]:
    _shape: Tuple[ndim, int]
    _strides: Tuple[ndim, int]
    _data: Ptr[dtype]

    def __new__(shape: Tuple[ndim, int],
                strides: Tuple[ndim, int],
                data: Ptr[dtype]) -> ndarray[dtype, ndim]:
        return (shape, strides, data)

    def __new__(shape: Tuple[ndim, int], data: Ptr[dtype], fcontig: bool = False):
        strides = util.strides(shape, fcontig, dtype)
        return ndarray(shape, strides, data)

    @property
    def _contig(self):
        shape = self.shape
        strides = self.strides
        itemsize = self.itemsize
        p_shape = Ptr[int](__ptr__(shape).as_byte())
        p_strides = Ptr[int](__ptr__(strides).as_byte())
        is_c_contig = True
        sd = itemsize

        for i in range(len(shape) - 1, -1, -1):
            dim = p_shape[i]
            if dim == 0:
                return (True, True)
            if dim != 1:
                if p_strides[i] != sd:
                    is_c_contig = False
                sd *= dim

        sd = itemsize
        for i in range(len(shape)):
            dim = p_shape[i]
            if dim != 1:
                if p_strides[i] != sd:
                    return (is_c_contig, False)
                sd *= dim

        return (is_c_contig, True)

    @property
    def _is_contig(self):
        c, f = self._contig
        return c or f

    def _contig_match(self, other):
        if staticlen(self.shape) != staticlen(other.shape):
            return False
        if self.shape != other.shape:
            return False
        c1, f1 = self._contig
        c2, f2 = other._contig
        return (c1 and c2) or (f1 and f2)

    @property
    def shape(self):
        return self._shape

    @property
    def strides(self):
        return self._strides

    @property
    def flags(self):
        return flagsobj(*self._contig)

    @property
    def data(self):
        return self._data

    @property
    def size(self):
        return util.count(self.shape)

    @property
    def itemsize(self):
        return util.sizeof(dtype)

    @property
    def nbytes(self):
        return self.size * self.itemsize

    def item(self, *args):
        n = self.size

        if staticlen(args) == 0:
            if n != 1:
                raise ValueError("can only convert an array of size 1 to a Python scalar")
            return self._data[0]

        if staticlen(args) == 1:
            idx = args[0]
            if idx < -n or idx >= n:
                raise IndexError(f"index {idx} is out of bounds for size {n}")
            if idx < 0:
                idx += n
            coords = util.index_to_coords(idx, self.shape)
            return self._ptr(coords)[0]
        else:
            if staticlen(args) != staticlen(self.shape):
                compile_error("incorrect number of indices for array")
            return self[args]

    def transpose(self, *axes):
        if staticlen(axes) == 0:
            return ndarray(self.shape[::-1], self.strides[::-1], self._data)
        elif staticlen(axes) == 1:
            if isinstance(axes[0], Tuple):
                return self.transpose(*axes[0])
            elif axes[0] is None:
                return self.transpose()
        elif staticlen(axes) != staticlen(self.shape):
            compile_error("axes don't match array")

        axes = tuple(util.normalize_axis_index(ax, self.ndim) for ax in axes)
        if util.has_duplicate(axes):
            raise ValueError("repeated axis in transpose")

        new_shape = tuple(self.shape[j] for j in axes)
        new_strides = tuple(self.strides[j] for j in axes)
        return ndarray(new_shape, new_strides, self._data)

    def transpose(self, axes: List[int]):
        if len(axes) != len(self.shape):
            raise ValueError("axes don't match array")
        if util.has_duplicate(axes):
            raise ValueError("repeated axis in transpose")

        ndim: Static[int] = staticlen(self.shape)
        new_shape = tuple(self.shape[axes[i]] for i in staticrange(ndim))
        new_strides = tuple(self.strides[axes[i]] for i in staticrange(ndim))
        return ndarray(new_shape, new_strides, self._data)

    def swapaxes(self, axis1: int, axis2: int):
        axis1 = util.normalize_axis_index(axis1, self.ndim, 'axis1')
        axis2 = util.normalize_axis_index(axis2, self.ndim, 'axis2')
        new_shape = self.shape
        new_strides = self.strides
        p1 = Ptr[int](__ptr__(new_shape).as_byte())
        p2 = Ptr[int](__ptr__(new_strides).as_byte())
        p1[axis1], p1[axis2] = p1[axis2], p1[axis1]
        p2[axis1], p2[axis2] = p2[axis2], p2[axis1]
        return ndarray(new_shape, new_strides, self._data)

    @property
    def T(self):
        return self.transpose()
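    # Illustrative sketch (comment only): `transpose`, `swapaxes` and `T` never
    # touch the element buffer; they only permute the shape/strides tuples. For
    # a C-contiguous 2x3 float64 array `a` (strides (24, 8) in bytes):
    #
    #   a.T.shape      # -> (3, 2)
    #   a.T.strides    # -> (8, 24)
    #   a.T.data       # same pointer as a.data (a view, not a copy)
    #
    # This is also why `flags.c_contiguous` of the transpose is generally False
    # even though the original array was C-contiguous.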
into shape {newshape}") s_original = self.size if staticlen(newshape) == 0: if s_original != 1: raise_reshape_size_mismatch(newshape, self) return newshape elif staticlen(newshape) == 1: if newshape[0] < 0: return (s_original,) elif newshape[0] != s_original: raise_reshape_size_mismatch(newshape, self) else: return newshape else: dimensions = Ptr[int](__ptr__(newshape).as_byte()) n = len(newshape) s_known = 1 i_unknown = -1 for i in range(n): dim = dimensions[i] if dim < 0: if i_unknown == -1: i_unknown = i else: raise ValueError("can only specify one unknown dimension") else: s_known *= dim if i_unknown >= 0: if s_known == 0 or s_original % s_known != 0: raise_reshape_size_mismatch(newshape, self) dimensions[i_unknown] = s_original // s_known else: if s_original != s_known: raise_reshape_size_mismatch(newshape, self) return newshape def _attempt_reshape_nocopy(self, newdims, is_f_order: bool): shape = self.shape strides = self.strides oldims = shape oldstrides = shape newstrides = (0,) * staticlen(newdims) p_olddims = Ptr[int](__ptr__(oldims).as_byte()) p_oldstrides = Ptr[int](__ptr__(oldstrides).as_byte()) p_newdims = Ptr[int](__ptr__(newdims).as_byte()) p_newstrides = Ptr[int](__ptr__(newstrides).as_byte()) oldnd = 0 for oi in staticrange(self.ndim): if shape[oi] != 1: p_olddims[oldnd] = shape[oi] p_oldstrides[oldnd] = strides[oi] oldnd += 1 oi = 0 oj = 1 ni = 0 nj = 1 newnd = len(newdims) while ni < newnd and oi < oldnd: np = p_newdims[ni] op = p_olddims[oi] while np != op: if np < op: np *= p_newdims[nj] nj += 1 else: op *= p_olddims[oj] oj += 1 ok = oi while ok < oj - 1: if is_f_order: if p_oldstrides[ok + 1] != p_olddims[ok] * p_oldstrides[ok]: return False, newdims else: if p_oldstrides[ok] != p_olddims[ok + 1] * p_oldstrides[ok + 1]: return False, newdims ok += 1 if is_f_order: p_newstrides[ni] = p_oldstrides[oi] nk = ni + 1 while nk < nj: p_newstrides[nk] = p_newstrides[nk - 1] * p_newdims[nk - 1] nk += 1 else: p_newstrides[nj - 1] = p_oldstrides[oj - 1] nk = nj - 1 while nk > ni: p_newstrides[nk - 1] = p_newstrides[nk] * p_newdims[nk] nk -= 1 ni = nj nj += 1 oi = oj oj += 1 last_stride = 0 if ni >= 1: last_stride = p_newstrides[ni - 1] else: last_stride = self.itemsize if is_f_order: last_stride *= p_newdims[ni - 1] nk = ni while nk < newnd: p_newstrides[nk] = last_stride nk += 1 return True, newstrides def reshape(self, *shape, order: str = 'C'): ndarray._check_order(order) if staticlen(shape) == 0: a = self.size if a != 1: raise ValueError(f'cannot reshape array of size {a} into shape ()') return ndarray((), (), self.data) if staticlen(shape) == 1 and isinstance(shape[0], Tuple): return self.reshape(*shape[0]) ccontig, fcontig = self._contig if order == 'A': order = 'F' if (fcontig and not ccontig) else 'C' elif order == 'K': raise ValueError("order 'K' is not permitted for reshaping") if staticlen(shape) == staticlen(self.shape): if shape == self.shape: return self shape = self._fix_unknown_dimension(shape) if (order == 'C' and not ccontig) or (order == 'F' and not fcontig): success, newstrides = self._attempt_reshape_nocopy(shape, (order == 'F')) if success: return ndarray(shape, newstrides, self._data) else: self = self.copy(order=order) return ndarray(shape, self._data, fcontig=(order == 'F')) def _loop(arrays, func, broadcast: Static[str] = 'all', check: Static[int] = True, alloc: type = type(()), optimize_order: Static[int] = True, extra = None): def call(func, args, extra): if extra is None: return func(*args) else: return func(*args, extra) def loop(shape, strides, 
    def _loop(arrays,
              func,
              broadcast: Static[str] = 'all',
              check: Static[int] = True,
              alloc: type = type(()),
              optimize_order: Static[int] = True,
              extra = None):
        def call(func, args, extra):
            if extra is None:
                return func(*args)
            else:
                return func(*args, extra)

        def loop(shape, strides, ptrs, func, extra):
            def incr_ptr(p: Ptr[T], s: int, T: type):
                return Ptr[T](p.as_byte() + s)

            if staticlen(shape) == 0:
                call(func, ptrs, extra)
            elif staticlen(shape) == 1:
                n = shape[0]

                # Common cases are:
                #   - len(ptrs) == 1 ; i.e. in-place unary operation
                #   - len(ptrs) == 2 ; i.e. unary or in-place binary operation
                #   - len(ptrs) == 3 ; i.e. binary operation
                # We handle these specially so as to ensure auto-vectorization.
                if staticlen(ptrs) == 2:
                    s0 = strides[0][0]
                    s1 = strides[1][0]
                    e0 = util.sizeof(type(ptrs[0][0]))
                    e1 = util.sizeof(type(ptrs[1][0]))

                    if s0 == e0 and s1 == e1:
                        for i in range(n):
                            call(func, (ptrs[0] + i, ptrs[1] + i), extra)
                    elif s0 == e0 and s1 == 0:
                        for i in range(n):
                            p0 = incr_ptr(ptrs[0], i * s0)
                            p1 = ptrs[1]
                            call(func, (p0, p1), extra)
                    else:
                        for i in range(n):
                            p0 = incr_ptr(ptrs[0], i * s0)
                            p1 = incr_ptr(ptrs[1], i * s1)
                            call(func, (p0, p1), extra)
                elif staticlen(ptrs) == 3:
                    s0 = strides[0][0]
                    s1 = strides[1][0]
                    s2 = strides[2][0]
                    e0 = util.sizeof(type(ptrs[0][0]))
                    e1 = util.sizeof(type(ptrs[1][0]))
                    e2 = util.sizeof(type(ptrs[2][0]))

                    if s0 == e0 and s1 == e1 and s2 == e2:
                        for i in range(n):
                            call(func, (ptrs[0] + i, ptrs[1] + i, ptrs[2] + i), extra)
                    elif s0 == e0 and s1 == 0 and s2 == e2:
                        for i in range(n):
                            p0 = incr_ptr(ptrs[0], i * s0)
                            p1 = ptrs[1]
                            p2 = incr_ptr(ptrs[2], i * s2)
                            call(func, (p0, p1, p2), extra)
                    elif s0 == e0 and s1 == e1 and s2 == 0:
                        for i in range(n):
                            p0 = incr_ptr(ptrs[0], i * s0)
                            p1 = incr_ptr(ptrs[1], i * s1)
                            p2 = ptrs[2]
                            call(func, (p0, p1, p2), extra)
                    else:
                        for i in range(n):
                            p0 = incr_ptr(ptrs[0], i * s0)
                            p1 = incr_ptr(ptrs[1], i * s1)
                            p2 = incr_ptr(ptrs[2], i * s2)
                            call(func, (p0, p1, p2), extra)
                else:
                    for i in range(shape[0]):
                        ptrs_i = tuple(incr_ptr(ptrs[j], i * strides[j][0])
                                       for j in staticrange(staticlen(ptrs)))
                        call(func, ptrs_i, extra)
            else:
                shape1 = shape[1:]
                strides1 = tuple(x[1:] for x in strides)

                for _ in range(shape[0]):
                    loop(shape1, strides1, ptrs, func, extra)
                    ptrs = tuple(incr_ptr(ptrs[i], strides[i][0])
                                 for i in staticrange(staticlen(ptrs)))

        def reorder_loops(strides):
            if staticlen(strides) == 0:
                return ()

            if staticlen(strides) == 1:
                return (0,)

            if staticlen(strides) == 2:
                s0 = strides[0]
                s1 = strides[1]
                if s0 and abs(s0) < abs(s1):
                    return (1, 0)
                return (0, 1)

            perm = util.tuple_range(staticlen(strides))
            perm, _ = util.sort_by_stride(perm, strides)
            return perm

        def broadcast_shapes(args, check: Static[int]):
            def largest(args):
                if staticlen(args) == 1:
                    return args[0]
                a = args[0]
                b = largest(args[1:])
                if staticlen(b) > staticlen(a):
                    return b
                else:
                    return a

            if staticlen(args) == 0:
                return ()

            t = largest(args)
            N: Static[int] = staticlen(t)
            ans = (0,) * N
            p = Ptr[int](__ptr__(ans).as_byte())

            for i in staticrange(N):
                p[i] = t[i]

            for a in args:
                for i in staticrange(staticlen(a)):
                    x = a[len(a) - 1 - i]
                    q = p + (len(t) - 1 - i)
                    y = q[0]
                    if y == 1:
                        q[0] = x
                    elif check and x != 1 and x != y:
                        raise ValueError('shape mismatch: objects cannot be broadcast to a single shape')

            return ans

        def broadcast_to(x, shape, check: Static[int]):
            N: Static[int] = x.ndim
            substrides = (0,) * N
            p = Ptr[int](__ptr__(substrides).as_byte())
            shape1, shape2 = shape[:-N], shape[-N:]

            for i in staticrange(N):
                a = x.shape[i]
                b = shape2[i]
                if a == b:
                    p[i] = x.strides[i]
                else:
                    if check:
                        if a != 1:
                            raise ValueError(f'cannot broadcast array of shape {x.shape} to shape {shape}')
                    p[i] = 0

            z = (0,) * (staticlen(shape) - x.ndim)
            new_strides = (*z, *substrides)
            return ndarray(shape, new_strides, x.data)
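        # Illustrative sketch (comment only): `broadcast_shapes` and
        # `broadcast_to` implement the usual right-aligned broadcasting rules.
        # For operand shapes (3, 1) and (4,), the common shape is (3, 4): the
        # second operand is viewed with strides (0, s) so every row reads the
        # same four elements, and the first with strides (s', 0) so each column
        # repeats its single element. Size-1 and missing leading axes simply
        # get a stride of 0 rather than any data being copied.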
        def broadcast_arrays(arrays, check: Static[int]):
            shape = broadcast_shapes(tuple(arr.shape for arr in arrays), check=check)
            return tuple(broadcast_to(arr, shape, check=False) for arr in arrays)

        def min_dim(arrays):
            if staticlen(arrays) == 0:
                compile_error("[internal error] arrays empty")
            elif staticlen(arrays) == 1:
                return arrays[0]
            else:
                arrays0 = arrays[0]
                arrays1 = min_dim(arrays[1:])
                if arrays1.ndim < arrays0.ndim:
                    return arrays1
                else:
                    return arrays0

        def max_dim(arrays):
            if staticlen(arrays) == 0:
                compile_error("[internal error] arrays empty")
            elif staticlen(arrays) == 1:
                return arrays[0]
            else:
                arrays0 = arrays[0]
                arrays1 = max_dim(arrays[1:])
                if arrays1.ndim > arrays0.ndim:
                    return arrays1
                else:
                    return arrays0

        def all_contiguous(arrays):
            min_arr = min_dim(arrays)
            max_arr = max_dim(arrays)

            if min_arr.ndim == max_arr.ndim:
                sh = True
                cc = True
                fc = True

                for i in staticrange(staticlen(arrays)):
                    arr = arrays[i]
                    if i > 0:
                        sh = sh and (arr.shape == arrays[0].shape)
                    cc1, fc1 = arr._contig
                    cc = cc and cc1
                    fc = fc and fc1

                return sh and (cc or fc), cc
            else:
                return False, False

        def alloc_array(count, perm_shape, dtype: type):
            p = Ptr[dtype](count)
            strides = ndarray(perm_shape, p).strides
            return (p, strides)

        def broadcast_args(arrays, broadcast: Static[str], check: Static[int]):
            if broadcast == 'none':
                shape = arrays[0].shape
                strides = tuple(arr.strides for arr in arrays)
            elif broadcast == 'first':
                shape = arrays[0].shape
                arrays1 = arrays[:1] + tuple(broadcast_to(arr, shape, check=check)
                                             for arr in arrays[1:])
                strides = tuple(arr.strides for arr in arrays1)
            elif broadcast == 'all':
                arrays1 = broadcast_arrays(arrays, check=check)
                shape = arrays1[0].shape
                strides = tuple(arr.strides for arr in arrays1)
            else:
                compile_error("'broadcast' argument must be 'none', 'first' or 'all'")

            return shape, strides

        if staticlen(arrays) == 0:
            return

        all_contig, ccontig = all_contiguous(arrays)
        min_arr = min_dim(arrays)
        max_arr = max_dim(arrays)

        if min_arr.ndim == max_arr.ndim:
            if all_contig:
                shape = arrays[0].shape
                strides = tuple(arr.strides for arr in arrays)
            else:
                shape, strides = broadcast_args(arrays, broadcast, check)
        else:
            shape, strides = broadcast_args(arrays, broadcast, check)

        alloc_tuple = util.zero(alloc)

        if staticlen(alloc_tuple) > 0:
            if optimize_order:
                perm0 = reorder_loops(max_arr.strides)
            else:
                perm0 = None

            count = util.count(shape)
            perm_shape = util.tuple_perm(shape, perm0)
            # `allocated` is a tuple of (ptr, strides) pairs
            allocated = tuple(alloc_array(count, perm_shape, type(alloc_tuple[i]))
                              for i in staticrange(staticlen(alloc_tuple)))
        else:
            perm0 = None
            allocated = ()

        if all_contig:
            for i in range(arrays[0].size):
                call(func,
                     tuple(tup[0] + i for tup in allocated) +
                     tuple(arr.data + i for arr in arrays),
                     extra)
            return tuple(ndarray(shape, tup[0], fcontig=(not ccontig)) for tup in allocated)

        shape0 = shape

        if optimize_order:
            if perm0 is None:
                perm = reorder_loops(max_arr.strides)
            else:
                perm = perm0
            shape = util.tuple_perm(shape, perm)
            strides = tuple(util.tuple_perm(s, perm) for s in strides)
        else:
            perm = None

        loop(shape,
             tuple(tup[1] for tup in allocated) + strides,
             tuple(tup[0] for tup in allocated) + tuple(arr.data for arr in arrays),
             func,
             extra)

        if perm is not None and staticlen(allocated) > 0 and staticlen(shape) >= 2:
            # permute the strides
            if staticlen(shape) == 2:
                rev = (perm[0] == 1)
                return tuple(ndarray(shape0, tup[1][::-1] if rev else tup[1], tup[0])
                             for tup in allocated)
            else:
                iperm = util.tuple_perm_inv(perm)
                return tuple(ndarray(shape0, util.tuple_perm(tup[1], iperm), tup[0])
                             for tup in allocated)
        else:
            return tuple(ndarray(shape0, tup[1], tup[0]) for tup in allocated)
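    # Illustrative sketch (comment only): `_loop` is the single driver behind
    # the elementwise helpers further below (`_op_elemwise`, `_op_scalar`, ...).
    # In rough terms it
    #
    #   1. broadcasts the inputs to a common shape, unless they are all
    #      contiguous with identical shapes, in which case it runs one flat,
    #      vectorization-friendly pass over the raw buffers;
    #   2. allocates one output buffer per entry of `alloc` (e.g. alloc=Tuple[T]
    #      yields a single ndarray of dtype T);
    #   3. optionally permutes the loop order by stride so the innermost loop
    #      walks the smallest strides, then calls `func` on element pointers.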
    def _contiguous(self, copy: Static[int] = False):
        ccontig, _ = self._contig

        if ccontig:
            if copy:
                n = self.size
                p = Ptr[dtype](n)
                str.memcpy(p.as_byte(), self._data.as_byte(), n * self.itemsize)
                return p
            else:
                return self._data
        else:
            n = self.size
            p = Ptr[dtype](n)
            i = 0
            for idx in util.multirange(self.shape):
                q = self._ptr(idx)
                p[i] = q[0]
                i += 1
            return p

    def _fcontiguous(self, copy: Static[int] = False):
        _, fcontig = self._contig

        if fcontig:
            if copy:
                n = self.size
                p = Ptr[dtype](n)
                str.memcpy(p.as_byte(), self._data.as_byte(), n * self.itemsize)
                return p
            else:
                return self._data
        else:
            n = self.size
            p = Ptr[dtype](n)
            i = 0
            for idx in util.fmultirange(self.shape):
                q = self._ptr(idx)
                p[i] = q[0]
                i += 1
            return p

    def tobytes(self, order: str = 'C'):
        ndarray._check_order(order)
        ccontig, fcontig = self._contig

        if order == 'A':
            order = 'F' if fcontig and not ccontig else 'C'

        n = self.size
        p = Ptr[dtype](n)

        if (order == 'C' and ccontig) or (order == 'F' and fcontig):
            str.memcpy(p.as_byte(), self._data.as_byte(), n * self.itemsize)
        elif order == 'F':
            i = 0
            for idx in util.fmultirange(self.shape):
                p[i] = self._ptr(idx)[0]
                i += 1
        else:
            i = 0
            for idx in util.multirange(self.shape):
                p[i] = self._ptr(idx)[0]
                i += 1

        return str(p.as_byte(), n * self.itemsize)

    def ravel(self, order: str = 'C'):
        ndarray._check_order(order)
        ccontig, fcontig = self._contig

        if order == 'A':
            order = 'F' if fcontig else 'C'

        if order == 'C':
            if ccontig:
                return ndarray((self.size,), self._data)
            else:
                return ndarray((self.size,), self._contiguous())
        elif order == 'F':
            if fcontig:
                return ndarray((self.size,), self._data)
            else:
                return ndarray((self.size,), self._fcontiguous())
        else:
            shape_sorted, strides_sorted = util.sort_by_stride(self.shape, self.strides)
            other = ndarray(shape_sorted, strides_sorted, self._data)
            return other.flatten()

    def flatten(self, order: str = 'C'):
        ndarray._check_order(order)
        ccontig, fcontig = self._contig

        if order == 'A':
            order = 'F' if fcontig else 'C'

        if order == 'C':
            return ndarray((self.size,), self._contiguous(copy=True))
        elif order == 'F':
            return ndarray((self.size,), self._fcontiguous(copy=True))
        else:
            shape_sorted, strides_sorted = util.sort_by_stride(self.shape, self.strides)
            other = ndarray(shape_sorted, strides_sorted, self._data)
            return other.flatten()

    @property
    def flat(self):
        return flatiter(self)

    @flat.setter
    def flat(self, value):
        self.flat[:] = value

    def tolist(self):
        if staticlen(self.shape) == 0:
            return List[dtype]()
        elif staticlen(self.shape) == 1:
            return [a for a in self]
        else:
            return [a.tolist() for a in self]

    def _ptr_for_index(self, indexes,
                       check: Static[int] = True,
                       broadcast: Static[int] = False):
        s = self.shape
        strides = self.strides
        pshape = Ptr[int](__ptr__(s).as_byte())
        pindex = Ptr[int](__ptr__(indexes).as_byte())
        pstride = Ptr[int](__ptr__(strides).as_byte())
        offset = 0

        for i in range(len(indexes)):
            idx = pindex[i]

            if staticlen(indexes) > staticlen(self.shape):
                if not broadcast:
                    compile_error("[internal error] index tuple too long")
                i -= staticlen(indexes) - staticlen(self.shape)
                if i < 0:
                    continue

            n = pshape[i]

            if broadcast:
                if n == 1:
                    continue

            if check:
                idx = util.normalize_index(idx, i, n)

            offset += idx * pstride[i]

        return Ptr[dtype](self._data.as_byte() + offset)

    def _ptr(self, indexes, broadcast: Static[int] = False):
        return self._ptr_for_index(indexes, check=False, broadcast=broadcast)

    def __len__(self):
        if staticlen(self.shape) == 0:
            compile_error("len() of unsized object")
        return self.shape[0]

    def __iter__(self):
        for i in range(self.shape[0]):
            yield self[i]
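    # Illustrative sketch (comment only): `ravel` and `flatten` differ exactly
    # as in NumPy. `ravel(order='C')` returns a view sharing the buffer when
    # the array is already C-contiguous (it calls `_contiguous()` without a
    # copy), while `flatten` always passes copy=True and therefore always
    # returns a freshly allocated 1-D array. `tobytes` likewise materializes a
    # contiguous copy, but returns it as a raw `str` of n * itemsize bytes.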
    def _check_order(order: str):
        if order not in ('C', 'F', 'A', 'K'):
            raise ValueError(f"order must be one of 'C', 'F', 'A', or 'K' (got {repr(order)})")

    def astype(self, dtype: type, order: str = 'K', copy: bool = True):
        ndarray._check_order(order)
        cc, fc = self._contig

        if dtype is self.dtype:
            x = self

            if copy or (order == 'C' and not cc) or (order == 'F' and not fc):
                a = self._data
                n = self.size
                b = Ptr[dtype](n)

                if ((order == 'C' and cc) or (order == 'F' and fc)):
                    f = fc and not cc
                    str.memcpy(b.as_byte(), a.as_byte(), n * self.itemsize)
                    x = ndarray(self.shape, b, fcontig=f)
                else:
                    f = False
                    if order == 'F':
                        f = True
                    elif order == 'A' or order == 'K':
                        f = fc

                    x = ndarray(self.shape, b, fcontig=f)
                    for idx in util.multirange(self.shape):
                        p = self._ptr(idx)
                        q = x._ptr(idx)
                        q[0] = p[0]

            return x

        a = self._data
        n = self.size
        b = Ptr[dtype](n)
        f = False

        if order == 'F':
            f = True
        elif order == 'A' or order == 'K':
            f = fc

        other = ndarray(self.shape, b, fcontig=f)
        for idx in util.multirange(self.shape):
            p = self._ptr(idx)
            q = other._ptr(idx)
            q[0] = util.cast(p[0], dtype)

        return other

    def copy(self, order: str = 'C'):
        return self.astype(dtype=dtype, order=order, copy=True)

    def __copy__(self):
        return self.copy()

    def _should_transpose(self, other = None):
        if other is None:
            if self.ndim > 1:
                s1 = self.strides[0]
                s2 = self.strides[-1]
                return s1 and abs(s1) < abs(s2)
            else:
                return False
        else:
            if self.ndim > 1 and other.ndim > 1:
                sa1 = self.strides[0]
                sa2 = self.strides[-1]
                sb1 = other.strides[0]
                sb2 = other.strides[-1]
                return sa1 and sb1 and abs(sa1) < abs(sa2) and abs(sb1) < abs(sb2)
            elif self.ndim > 1:
                s1 = self.strides[0]
                s2 = self.strides[-1]
                return s1 and abs(s1) < abs(s2)
            elif other.ndim > 1:
                s1 = other.strides[0]
                s2 = other.strides[-1]
                return s1 and abs(s1) < abs(s2)
            else:
                return False

    def _normalize(self, other: ndarray):
        if self.ndim > other.ndim:
            diff: Static[int] = self.ndim - other.ndim
            A = self
            B = ndarray((1,) * diff + other.shape, (0,) * diff + other.strides, other.data)
        elif self.ndim < other.ndim:
            diff: Static[int] = other.ndim - self.ndim
            A = ndarray((1,) * diff + self.shape, (0,) * diff + self.strides, self.data)
            B = other
        else:
            A = self
            B = other

        return A, B

    def _op_elemwise(self, other: ndarray, op):
        dtype1 = self.dtype
        dtype2 = other.dtype
        r1, r2 = util.op_types(dtype1, dtype2)
        R1 = type(r1)
        R2 = type(r2)
        T = type(op(util.cast(self.data[0], R1), util.cast(other.data[0], R2)))
        return ndarray._loop((self, other), _BinaryFunctor(op=op, R1=R1, R2=R2), alloc=Tuple[T])[0]

    def _rop_elemwise(self, other: ndarray, op):
        dtype1 = self.dtype
        dtype2 = other.dtype
        r1, r2 = util.op_types(dtype1, dtype2)
        R1 = type(r1)
        R2 = type(r2)
        T = type(op(util.cast(self.data[0], R1), util.cast(other.data[0], R2)))
        return ndarray._loop((self, other), _RightBinaryFunctor(op=op, R1=R1, R2=R2), alloc=Tuple[T])[0]

    def _op_scalar(self, b, op):
        dtype1 = self.dtype
        dtype2 = type(b)
        r1, r2 = util.op_types(dtype1, dtype2)
        R1 = type(r1)
        R2 = type(r2)
        T = type(op(util.cast(self.data[0], R1), util.cast(b, R2)))
        return ndarray._loop((self,), _ScalarFunctor(op=op, y=b, R1=R1, R2=R2), alloc=Tuple[T])[0]

    def _iop_elemwise(self, other: ndarray, op):
        ndarray._loop((self, other), _InplaceBinaryFunctor(op))
        return self

    def _iop_scalar(self, b, op):
        ndarray._loop((self,), _InplaceScalarFunctor(op=op, y=b))
        return self

    def _rop_scalar(self, b, op):
        dtype1 = self.dtype
        dtype2 = type(b)
        r1, r2 = util.op_types(dtype1, dtype2)
        R1 = type(r1)
        R2 = type(r2)
        T = type(op(util.cast(self.data[0], R1), util.cast(b, R2)))
        return ndarray._loop((self,), _RightScalarFunctor(op=op, y=b, R1=R1, R2=R2), alloc=Tuple[T])[0]
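    # Illustrative sketch (comment only): the arithmetic helpers above pick the
    # common computation types via `util.op_types` and then hand a functor to
    # `_loop`. A hypothetical `a + b` for two arrays would reduce to
    #
    #   a._op_elemwise(b, lambda x, y: x + y)
    #
    # while `a += 3` would reduce to `a._iop_scalar(3, lambda x, y: x + y)`,
    # which casts the scalar to `a.dtype` and writes the results back in place.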
    def _op_unary(self, op):
        T = type(op(self.data[0]))
        return ndarray._loop((self,), _UnaryFunctor(op), alloc=Tuple[T])[0]

    def _iop_unary(self, op):
        ndarray._loop((self,), _InplaceUnaryFunctor(op))
        return self

    def _any(self, cond):
        n = self.size
        a = self._data

        if self._is_contig:
            for i in range(n):
                if cond(a[i]):
                    return True
        else:
            A = self.T if self._should_transpose() else self
            for idx in util.multirange(A.shape):
                if cond(A._ptr(idx)[0]):
                    return True

        return False

    def _all(self, cond):
        n = self.size
        a = self._data

        if self._is_contig:
            for i in range(n):
                if not cond(a[i]):
                    return False
        else:
            A = self.T if self._should_transpose() else self
            for idx in util.multirange(A.shape):
                if not cond(A._ptr(idx)[0]):
                    return False

        # No element failed the predicate.
        return True

    def _minmax(self):
        n = self.size
        a = self._data

        if n == 0:
            return util.zero(dtype), util.zero(dtype)

        M = a[0]
        m = a[0]

        if self._is_contig:
            for i in range(1, n):
                e = a[i]
                if e > M:
                    M = e
                if e < m:
                    m = e
        else:
            A = self.T if self._should_transpose() else self
            for idx in util.multirange(A.shape):
                e = A._ptr(idx)[0]
                if e > M:
                    M = e
                if e < m:
                    m = e

        return m, M

    def map(self, fn, inplace: Static[int] = False):
        if inplace:
            return self._iop_unary(fn)
        else:
            return self._op_unary(fn)

    def fill(self, value):
        value = util.cast(value, dtype)
        self.map(lambda x: value, inplace=True)

    def _size1_error():
        raise ValueError("only size-1 arrays can be converted to scalars")

    def __int__(self):
        if self.size != 1:
            ndarray._size1_error()
        return int(self._data[0])

    def __float__(self):
        if self.size != 1:
            ndarray._size1_error()
        return float(self._data[0])

    def __complex__(self):
        if self.size != 1:
            ndarray._size1_error()
        return complex(self._data[0])

    def __bool__(self):
        if self.size != 1:
            raise ValueError("The truth value of an array with more than one element is ambiguous. "
                             "Use a.any() or a.all()")
        return bool(self._data[0])
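    # Illustrative sketch (comment only): the scalar conversions above require
    # exactly one element, mirroring NumPy:
    #
    #   one element     -> int(a), float(a), complex(a), bool(a) convert a._data[0]
    #   any other size  -> ValueError ("only size-1 arrays ..." for the numeric
    #                      conversions, the ambiguous-truth-value message for bool)
    #
    # `_any`/`_all` short-circuit on the first deciding element and walk the
    # transposed array when `_should_transpose` says that gives a more
    # cache-friendly stride order.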
Use a.any() or a.all()") return bool(self._data[0]) def view(self, dtype: type): my_size = self.itemsize dt_size = util.sizeof(dtype) new_shape = self.shape new_strides = self.strides new_data = Ptr[dtype](self.data.as_byte()) if my_size != dt_size: if self.ndim == 0: raise ValueError("Changing the dtype of a 0d array is only supported if the itemsize is unchanged") elif self.shape[-1] != 1 and self.size != 0 and self.strides[-1] != self.itemsize: raise ValueError("To change to a dtype of a different size, the last axis must be contiguous") elif my_size > dt_size: if dt_size == 0 or my_size % dt_size != 0: raise ValueError("When changing to a smaller dtype, its size must be a divisor of the size of original dtype") newdim = my_size // dt_size new_shape = new_shape[:-1] + (new_shape[-1] * newdim,) new_strides = new_strides[:-1] + (dt_size,) elif my_size < dt_size: newdim = self.shape[-1] * my_size if newdim % dt_size != 0: raise ValueError("When changing to a larger dtype, its size must be a " "divisor of the total size in bytes of the last axis " "of the array.") new_shape = new_shape[:-1] + (newdim // dt_size,) new_strides = new_strides[:-1] + (dt_size,) return ndarray(new_shape, new_strides, new_data) def byteswap(self, inplace: bool = False): def bswap(x: T, T: type): if T is int or T is byte or isinstance(T, Int) or isinstance(T, UInt): return util.bswap(x) if T is float: return util.bitcast(util.bswap(util.bitcast(x, u64)), float) if T is float32: return util.bitcast(util.bswap(util.bitcast(x, u32)), float32) if T is complex or T is complex64: return T(bswap(x.real), bswap(x.imag)) if not util.atomic(T): return x y = x p = __ptr__(y).as_byte() n = util.sizeof(T) q = p + (n - 1) while p < q: p[0], q[0] = q[0], p[0] p += 1 q -= 1 return y if inplace: return self.map(bswap, inplace=True) else: return self.map(bswap, inplace=False) def _ptr_flat(self, idx: int, check: Static[int]): if check: n = self.size if idx < -n or idx >= n: raise IndexError(f"index {idx} is out of bounds for size {n}") if idx < 0: idx += n return self._ptr(util.index_to_coords(idx, self.shape)) def _get_flat(self, idx: int, check: Static[int]): return self._ptr_flat(idx, check=check)[0] def _set_flat(self, idx: int, val, check: Static[int]): self._ptr_flat(idx, check=check)[0] = util.cast(val, dtype)