import gpu

@test
def test_hello_world():
    # Element-wise vector addition: one GPU thread per element.
    @gpu.kernel
    def kernel(a, b, c):
        i = gpu.thread.x
        c[i] = a[i] + b[i]

    a = [i for i in range(16)]
    b = [2*i for i in range(16)]
    c = [0 for _ in range(16)]

    kernel(a, b, c, grid=1, block=16)
    assert c == [3*i for i in range(16)]

@test
def test_raw():
    # Same addition kernel, but arguments are wrapped in gpu.raw to
    # bypass the automatic argument conversions.
    @gpu.kernel
    def kernel(a, b, c):
        i = gpu.thread.x
        c[i] = a[i] + b[i]

    a = [i for i in range(16)]
    b = [2*i for i in range(16)]
    c = [0 for _ in range(16)]

    kernel(gpu.raw(a), gpu.raw(b), gpu.raw(c), grid=1, block=16)
    assert c == [3*i for i in range(16)]

@test
def test_conversions():
    # Round-trip values of various types through a kernel that simply
    # writes its argument back.
    @gpu.kernel
    def kernel(x, v):
        v[0] = x

    def check(x):
        T = type(x)
        v = [T()]
        kernel(x, v, grid=1, block=1)
        return v == [x]

    assert check(None)
    assert check(42)
    assert check(3.14)
    assert check(f32(2.718))
    assert check(byte(99))
    assert check(Int[128](123123))
    assert check(UInt[128](321321))
    assert check(Optional[int]())
    assert check(Optional(111))
    assert check((1, 2, 3))
    assert check(([1], [2], [3]))
    # assert check(())  # TODO: PTX can't handle this; why?
    assert check(DynamicTuple((1, 2, 3)))
    assert check(DynamicTuple(([1], [2], [3])))
    assert check(DynamicTuple[int]())
    assert check(DynamicTuple[List[List[List[str]]]]())
    assert check('hello world')
    assert check([1, 2, 3])
    assert check([[1], [2], [3]])
    assert check({1: [1.1], 2: [2.2]})
    assert check({'a', 'b', 'c'})
    assert check(Optional([1, 2, 3]))

@test
def test_user_classes():
    # Kernels can take user-defined types: a GPU-enabled dataclass and
    # a named tuple, both directly and inside containers.
    @dataclass(gpu=True, eq=True)
    class A:
        x: int
        y: List[int]

    @tuple
    class B:
        x: int
        y: List[int]

    @gpu.kernel
    def kernel(a, b, c):
        a.x += b.x + c[0]
        c[1][0][0] = 9999
        a.y[0] = c[0] + 1
        b.y[0] = c[0] + 2

    a = A(42, [-1])
    b = B(100, [-2])
    c = (1000, [[-1]])

    kernel(a, b, c, grid=1, block=1)
    assert a == A(1142, [1001])
    assert b == B(100, [1002])
    assert c == (1000, [[9999]])

    @gpu.kernel
    def kernel2(a, b, c):
        a[0].x += b[0].x + c[0][0]
        c[0][1][0][0] = 9999
        a[0].y[0] = c[0][0] + 1
        b[0].y[0] = c[0][0] + 2

    a = [A(42, [-1])]
    b = [B(100, [-2])]
    c = [(1000, [[-1]])]

    kernel2(a, b, c, grid=1, block=1)
    assert a == [A(1142, [1001])]
    assert b == [B(100, [1002])]
    assert c == [(1000, [[9999]])]

@test
def test_intrinsics():
    # Each thread computes its global linear ID from the grid/block
    # intrinsics and writes it to its own slot.
    @gpu.kernel
    def kernel(v):
        block_id = (gpu.block.x +
                    gpu.block.y*gpu.grid.dim.x +
                    gpu.block.z*gpu.grid.dim.x*gpu.grid.dim.y)
        thread_id = (block_id*gpu.block.dim.x*gpu.block.dim.y*gpu.block.dim.z +
                     gpu.thread.z*gpu.block.dim.x*gpu.block.dim.y +
                     gpu.thread.y*gpu.block.dim.x +
                     gpu.thread.x)
        v[thread_id] = thread_id
        gpu.syncthreads()

    grid = gpu.Dim3(3, 4, 5)
    block = gpu.Dim3(6, 7, 8)
    N = grid.x * grid.y * grid.z * block.x * block.y * block.z
    v = [0 for _ in range(N)]
    kernel(v, grid=grid, block=block)
    assert v == list(range(N))

@test
def test_matmul():
    A = [[12, 7, 3],
         [4, 5, 6],
         [7, 8, 9]]

    B = [[5, 8, 1, 2],
         [6, 7, 3, 0],
         [4, 5, 9, 1]]

    def mmz(A, B):
        return [[0]*len(B[0]) for _ in range(len(A))]

    # CPU reference implementation.
    def matmul(A, B):
        result = mmz(A, B)
        for i in range(len(A)):
            for j in range(len(B[0])):
                for k in range(len(B)):
                    result[i][j] += A[i][k] * B[k][j]
        return result

    expected = matmul(A, B)

    # One GPU thread per output element.
    @gpu.kernel
    def kernel(A, B, result):
        i = gpu.thread.x
        j = gpu.thread.y
        result[i][j] = sum(A[i][k]*B[k][j] for k in range(len(A[0])))

    result = mmz(A, B)
    kernel(A, B, result, grid=1, block=(len(result), len(result[0])))
    assert result == expected

MAX = 1000  # maximum Mandelbrot iterations
N = 256     # width and height of image

@test
def test_mandelbrot():
    pixels = [0 for _ in range(N * N)]

    def scale(x, a, b):
        return a + (x/N)*(b - a)

    # CPU reference.
    expected = [0 for _ in range(N * N)]
    for i in range(N):
        for j in range(N):
            c = complex(scale(j, -2.00, 0.47), scale(i, -1.12, 1.12))
            z = 0j
            iteration = 0

            while abs(z) <= 2 and iteration < MAX:
                z = z**2 + c
                iteration += 1

            expected[N*i + j] = int(255 * iteration/MAX)

    # One thread per pixel; map the linear thread index to coordinates.
    @gpu.kernel
    def kernel(pixels):
        idx = (gpu.block.x * gpu.block.dim.x) + gpu.thread.x
        i, j = divmod(idx, N)
        c = complex(scale(j, -2.00, 0.47), scale(i, -1.12, 1.12))
        z = 0j
        iteration = 0

        while abs(z) <= 2 and iteration < MAX:
            z = z**2 + c
            iteration += 1

        pixels[idx] = int(255 * iteration/MAX)

    kernel(pixels, grid=(N*N)//1024, block=1024)
    assert pixels == expected

@test
def test_kitchen_sink():
    # Dictionaries, sets and generator expressions inside a kernel.
    @gpu.kernel
    def kernel(x):
        i = gpu.thread.x
        d = {1: 2.1, 2: 3.5, 3: 4.2}
        s = {4, 5, 6}
        z = sum(
            d.get(x[i], j) + (j if i in s else -j)
            for j in range(i)
        )
        x[i] = int(z)

    x = [i for i in range(16)]
    kernel(x, grid=1, block=16)
    assert x == [0, 2, 6, 9, 12, 20, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0]

@test
def test_auto_par():
    # @par(gpu=True) offloads the marked loop to the GPU.
    a = [i for i in range(16)]
    b = [2*i for i in range(16)]
    c = [0 for _ in range(16)]

    @par(gpu=True)
    for i in range(16):
        c[i] = a[i] + b[i]

    assert c == [3*i for i in range(16)]

    @par(gpu=True)
    for i in range(16):
        c[i] += a[i] + b[i]

    assert c == [6*i for i in range(16)]

    N = 200
    Z = 42
    x = [0] * (N*N)
    y = [0] * (N*N)

    for i in range(2, N - 1, 3):
        for j in range(3, N, 2):
            x[i*N + j] = i + j + Z

    # Collapse the two nested loops into a single GPU launch.
    @par(gpu=True, collapse=2)
    for i in range(2, N - 1, 3):
        for j in range(3, N, 2):
            y[i*N + j] = i + j + Z

    assert x == y

    # Empty loop body: the launch itself must still compile and run.
    @par(gpu=True)
    for i in range(1):
        pass

test_hello_world()
test_raw()
test_conversions()
test_user_classes()
test_intrinsics()
test_matmul()
test_mandelbrot()
test_kitchen_sink()
test_auto_par()