Compare commits

...

21 Commits

Author SHA1 Message Date
Jonas Neubert dcb41dcfc9
codon build command: add --cir output type option () 2025-04-22 11:46:03 -04:00
A. R. Shajii c1dae7d87d Update OpenBLAS 2025-04-04 14:59:13 -04:00
A. R. Shajii 984974b40d Support CMake 4.0 2025-04-04 11:27:35 -04:00
A. R. Shajii 915cb4e9f0
Support converting bytes object to Codon str () 2025-04-03 10:41:19 -04:00
A. R. Shajii ce5c49edb5 Fix typo in docs and README 2025-04-03 10:39:45 -04:00
A. R. Shajii 59f5bbb73b Bump versions 2025-03-18 10:46:58 -04:00
A. R. Shajii 93fb3d53e3
JIT argument order fix ()
* Fix argument ordering in JIT

* Format

* Update JIT tests

* Fix JIT test
2025-03-18 10:45:34 -04:00
A. R. Shajii b3f6c12d57 Fix 0d array conversions from Python 2025-03-03 11:31:49 -05:00
A. R. Shajii b17d21513d Remove -static-libstdc++ compilation flag 2025-02-18 14:49:45 -05:00
Ibrahim Numanagić d035f1dc97
C-based Cython Backend ()
* Move to C-based Cython backend (to avoid all those C++ ABI issues with std::string)

* Fix CI
2025-02-18 10:22:03 -05:00
A. R. Shajii dc5e5ac7a6 Bump version 2025-02-11 22:04:22 -05:00
A. R. Shajii 01a7503762 Bump version 2025-02-11 17:41:16 -05:00
A. R. Shajii f1ab7116d8 Fix np.pad() casting 2025-02-11 15:49:15 -05:00
A. R. Shajii b58b1ee767 Update OpenMP reduction detection for new ops 2025-02-07 12:04:12 -05:00
A. R. Shajii 56c00d36c2 Add additional int-float operators 2025-02-06 14:11:52 -05:00
A. R. Shajii 4521182aa8 Update np.correlate() 2025-02-04 17:32:54 -05:00
A. R. Shajii 44c59c2a03 Fix artifact names 2025-01-29 20:17:05 -05:00
A. R. Shajii 15c43eb94e Publish to PyPI in workflow 2025-01-29 15:52:50 -05:00
A. R. Shajii b8c1eeed36
2025 updates ()
* 2025 updates

* Update ci.yml
2025-01-29 15:41:43 -05:00
A. R. Shajii d13d6a58e3 Fix doc subcommand if no path given 2024-11-13 11:30:00 -05:00
Ibrahim Numanagić 37ff25a907
Fix underscore float parsing ()
* Fix underscore float parsing

* Add tests

* Update float parsing

---------

Co-authored-by: A. R. Shajii <ars@ars.me>
2024-10-01 15:35:11 -04:00
422 changed files with 73619 additions and 708 deletions
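For context on the "Fix underscore float parsing" commit above: Python (PEP 515) allows underscores as digit separators in numeric literals, so a float parser has to accept forms like the following. This is a minimal illustration, not code taken from the change itself.

```python
# PEP 515-style digit separators that float parsing must accept
print(1_000.000_1)   # 1000.0001
print(1_0.5e1_0)     # 105000000000.0
```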

View File

@ -0,0 +1,3 @@
FROM quay.io/pypa/manylinux2014_aarch64
COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]

View File

@ -0,0 +1,5 @@
name: manylinux build (aarch64)
description: Builds Codon on manylinux (aarch64)
runs:
using: docker
image: Dockerfile

View File

@ -4,13 +4,12 @@ set -e
# setup
cd /github/workspace
yum -y update
yum -y install python3 python3-devel
yum -y install python3 python3-devel gcc-gfortran
# env
export PYTHONPATH=$(pwd)/test/python
export CODON_PYTHON=$(python3 test/python/find-python-library.py)
python3 -m pip install -Iv pip==21.3.1
python3 -m pip install numpy
python3 -m pip install -Iv pip==21.3.1 numpy==1.17.5
# deps
if [ ! -d ./llvm ]; then
@ -22,6 +21,7 @@ mkdir build
export CC="$(pwd)/llvm/bin/clang"
export CXX="$(pwd)/llvm/bin/clang++"
export LLVM_DIR=$(llvm/bin/llvm-config --cmakedir)
export CODON_SYSTEM_LIBRARIES=/usr/lib64
(cd build && cmake .. -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=${CC} \
-DCMAKE_CXX_COMPILER=${CXX})
@ -44,6 +44,7 @@ build/codon_test
# package
export CODON_BUILD_ARCHIVE=codon-$(uname -s | awk '{print tolower($0)}')-$(uname -m).tar.gz
rm -rf codon-deploy/lib/libfmt.a codon-deploy/lib/pkgconfig codon-deploy/lib/cmake codon-deploy/python/codon.egg-info codon-deploy/python/dist codon-deploy/python/build
rm -rf codon-deploy/lib/libfmt.a codon-deploy/lib/pkgconfig codon-deploy/lib/cmake \
codon-deploy/python/codon.egg-info codon-deploy/python/dist codon-deploy/python/build
tar -czf ${CODON_BUILD_ARCHIVE} codon-deploy
du -sh codon-deploy

View File

@ -0,0 +1,5 @@
name: manylinux build (x86_64)
description: Builds Codon on manylinux (x86_64)
runs:
using: docker
image: Dockerfile

View File

@ -0,0 +1,50 @@
#!/bin/sh -l
set -e
# setup
cd /github/workspace
yum -y update
yum -y install python3 python3-devel gcc-gfortran
# env
export PYTHONPATH=$(pwd)/test/python
export CODON_PYTHON=$(python3 test/python/find-python-library.py)
python3 -m pip install -Iv pip==21.3.1 numpy==1.17.5
# deps
if [ ! -d ./llvm ]; then
/bin/bash scripts/deps.sh 2;
fi
# build
mkdir build
export CC="$(pwd)/llvm/bin/clang"
export CXX="$(pwd)/llvm/bin/clang++"
export LLVM_DIR=$(llvm/bin/llvm-config --cmakedir)
export CODON_SYSTEM_LIBRARIES=/usr/lib64
(cd build && cmake .. -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=${CC} \
-DCMAKE_CXX_COMPILER=${CXX})
cmake --build build --config Release -- VERBOSE=1
cmake --install build --prefix=codon-deploy
# build cython
export PATH=$PATH:$(pwd)/llvm/bin
python3 -m pip install cython wheel astunparse
(cd codon-deploy/python && python3 setup.py sdist)
CODON_DIR=$(pwd)/codon-deploy python3 -m pip install -v codon-deploy/python/dist/*.gz
python3 test/python/cython_jit.py
# test
export LD_LIBRARY_PATH=$(pwd)/build:$LD_LIBRARY_PATH
export PYTHONPATH=$(pwd):$PYTHONPATH
export CODON_PATH=$(pwd)/stdlib
ln -s build/libcodonrt.so .
build/codon_test
# package
export CODON_BUILD_ARCHIVE=codon-$(uname -s | awk '{print tolower($0)}')-$(uname -m).tar.gz
rm -rf codon-deploy/lib/libfmt.a codon-deploy/lib/pkgconfig codon-deploy/lib/cmake \
codon-deploy/python/codon.egg-info codon-deploy/python/dist codon-deploy/python/build
tar -czf ${CODON_BUILD_ARCHIVE} codon-deploy
du -sh codon-deploy

View File

@ -1,5 +0,0 @@
name: manylinux build
description: Builds Codon on manylinux
runs:
using: docker
image: Dockerfile

View File

@ -26,7 +26,12 @@ jobs:
uses: ncipollo/release-action@v1
manylinux:
runs-on: ubuntu-latest
strategy:
matrix:
arch:
- x86_64
# - aarch64
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-arm-latest' || 'ubuntu-latest' }}
name: Codon CI (manylinux)
needs: create_release
permissions:
@ -39,10 +44,15 @@ jobs:
uses: actions/cache@v4
with:
path: llvm
key: manylinux-llvm
key: manylinux-${{ matrix.arch }}-llvm
- name: Main
uses: ./.github/actions/build-manylinux
- name: Main x86_64
if: matrix.arch == 'x86_64'
uses: ./.github/actions/build-manylinux-x86_64
- name: Main aarch64
if: matrix.arch == 'aarch64'
uses: ./.github/actions/build-manylinux-aarch64
- name: Upload Release Asset
if: contains(github.ref, 'tags/v')
@ -66,7 +76,8 @@ jobs:
matrix:
os:
- ubuntu-latest
- macos-12
- macos-latest
# - ubuntu-arm-latest
runs-on: ${{ matrix.os }}
name: Codon CI
needs: create_release
@ -79,23 +90,49 @@ jobs:
with:
python-version: '3.9'
- name: Linux Setup
if: startsWith(matrix.os, 'ubuntu')
- name: x86_64 Linux Setup
if: startsWith(matrix.os, 'ubuntu') && matrix.os != 'ubuntu-arm-latest'
run: |
sudo apt update
sudo apt install -y gfortran libgfortran5 lsb-release wget software-properties-common gnupg
wget https://apt.llvm.org/llvm.sh
sudo chmod +x llvm.sh
sudo ./llvm.sh 17
echo "LIBEXT=so" >> $GITHUB_ENV
echo "OS_NAME=linux" >> $GITHUB_ENV
echo "CODON_SYSTEM_LIBRARIES=/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
echo "CC=clang-17" >> $GITHUB_ENV
echo "CXX=clang++-17" >> $GITHUB_ENV
- name: Arm Linux Setup
if: matrix.os == 'ubuntu-arm-latest'
run: |
sudo apt update
sudo apt install -y gfortran libgfortran5 lsb-release wget software-properties-common gnupg
wget https://apt.llvm.org/llvm.sh
sudo chmod +x llvm.sh
sudo ./llvm.sh 17
echo "LIBEXT=so" >> $GITHUB_ENV
echo "OS_NAME=linux" >> $GITHUB_ENV
echo "CODON_SYSTEM_LIBRARIES=/usr/lib/aarch64-linux-gnu" >> $GITHUB_ENV
echo "CC=clang-17" >> $GITHUB_ENV
echo "CXX=clang++-17" >> $GITHUB_ENV
- name: macOS Setup
if: startsWith(matrix.os, 'macos')
run: |
brew install automake
echo "LIBEXT=dylib" >> $GITHUB_ENV
echo "OS_NAME=osx" >> $GITHUB_ENV
echo "CODON_SYSTEM_LIBRARIES=$(brew --prefix gcc)/lib/gcc/current" >> $GITHUB_ENV
echo "CC=clang" >> $GITHUB_ENV
echo "CXX=clang++" >> $GITHUB_ENV
echo "FC=gfortran-12" >> $GITHUB_ENV
- name: Set up Python
run: |
python -m pip install --upgrade pip setuptools wheel
python -m pip install numpy cython wheel astunparse
python -m pip install cython wheel astunparse
python -m pip install --force-reinstall -v "numpy==1.26.4"
which python
which pip
echo "CODON_PYTHON=$(python test/python/find-python-library.py)" >> $GITHUB_ENV
@ -105,14 +142,11 @@ jobs:
uses: actions/cache@v4
with:
path: llvm
key: ${{ runner.os }}-llvm
key: ${{ runner.os }}-${{ matrix.os }}-llvm
- name: Build Dependencies
if: steps.cache-deps.outputs.cache-hit != 'true'
run: ./scripts/deps.sh 2
env:
CC: clang
CXX: clang++
- name: Build
run: |
@ -123,18 +157,12 @@ jobs:
-DCMAKE_CXX_COMPILER=${CXX})
cmake --build build --config Release -- VERBOSE=1
cmake --install build --prefix=codon-deploy
env:
CC: clang
CXX: clang++
- name: Build Cython
run: |
(cd codon-deploy/python && python3 setup.py sdist)
CODON_DIR=$(pwd)/codon-deploy python -m pip install -v codon-deploy/python/dist/*.gz
python test/python/cython_jit.py
env:
CC: clang
CXX: clang++
CODON_PATH=$(pwd)/codon-deploy/lib/codon/stdlib python test/python/cython_jit.py
- name: Test
run: |
@ -151,10 +179,15 @@ jobs:
run: |
echo "CODON_BUILD_ARCHIVE=codon-$(uname -s | awk '{print tolower($0)}')-$(uname -m).tar.gz" >> $GITHUB_ENV
- name: Codesign (macOS)
if: startsWith(matrix.os, 'macos')
run: |
codesign -f -s - codon-deploy/bin/codon codon-deploy/lib/codon/*.dylib
- name: Prepare Artifacts
run: |
cp -rf codon-deploy/python/dist .
rm -rf codon-deploy/lib/libfmt.a codon-deploy/lib/pkgconfig codon-deploy/lib/cmake codon-deploy/python/codon.egg-info codon-deploy/python/dist codon-deploy/python/build
rm -rf codon-deploy/lib/libfmt.a codon-deploy/lib/pkgconfig codon-deploy/lib/cmake codon-deploy/python/codon_jit.egg-info codon-deploy/python/build
tar -czf ${CODON_BUILD_ARCHIVE} codon-deploy
du -sh codon-deploy
@ -165,24 +198,31 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ needs.create_release.outputs.upload_url }}
asset_path: ./codon-darwin-x86_64.tar.gz
asset_name: codon-darwin-x86_64.tar.gz
asset_path: ./codon-darwin-arm64.tar.gz
asset_name: codon-darwin-arm64.tar.gz
asset_content_type: application/gzip
- name: Upload Artifacts
if: startsWith(matrix.os, 'macos')
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-x86_64
path: codon-darwin-x86_64.tar.gz
name: ${{ matrix.os }}-arm64
path: codon-darwin-arm64.tar.gz
- name: Upload Artifacts
if: startsWith(matrix.os, 'ubuntu')
if: startsWith(matrix.os, 'ubuntu') && matrix.os != 'ubuntu-arm-latest'
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-x86_64
path: codon-linux-x86_64.tar.gz
# - name: Publish Package
# if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && startsWith(matrix.os, 'ubuntu')
# uses: pypa/gh-action-pypi-publish@release/v1
- name: Upload Artifacts
if: matrix.os == 'ubuntu-arm-latest'
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-arm64
path: codon-linux-arm64.tar.gz
- name: Publish Package
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && startsWith(matrix.os, 'ubuntu')
uses: pypa/gh-action-pypi-publish@release/v1

View File

@ -1,10 +1,10 @@
cmake_minimum_required(VERSION 3.14)
project(
Codon
VERSION "0.17.0"
VERSION "0.18.2"
HOMEPAGE_URL "https://github.com/exaloop/codon"
DESCRIPTION "high-performance, extensible Python compiler")
set(CODON_JIT_PYTHON_VERSION "0.2.0")
set(CODON_JIT_PYTHON_VERSION "0.3.2")
configure_file("${PROJECT_SOURCE_DIR}/cmake/config.h.in"
"${PROJECT_SOURCE_DIR}/codon/config/config.h")
configure_file("${PROJECT_SOURCE_DIR}/cmake/config.py.in"
@ -48,10 +48,8 @@ include(${CMAKE_SOURCE_DIR}/cmake/deps.cmake)
set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
if(APPLE)
set(CMAKE_INSTALL_RPATH "@loader_path;@loader_path/../lib/codon")
set(STATIC_LIBCPP "")
else()
set(CMAKE_INSTALL_RPATH "$ORIGIN:$ORIGIN/../lib/codon")
set(STATIC_LIBCPP "-static-libstdc++")
endif()
add_executable(peg2cpp codon/util/peg2cpp.cpp)
@ -73,17 +71,72 @@ set(CODON_JUPYTER_FILES codon/util/jupyter.h codon/util/jupyter.cpp)
add_library(codon_jupyter SHARED ${CODON_JUPYTER_FILES})
# Codon runtime library
add_library(codonfloat STATIC
codon/runtime/floatlib/extenddftf2.c
codon/runtime/floatlib/fp_trunc.h
codon/runtime/floatlib/truncdfhf2.c
codon/runtime/floatlib/extendhfsf2.c
codon/runtime/floatlib/int_endianness.h
codon/runtime/floatlib/truncdfsf2.c
codon/runtime/floatlib/extendhftf2.c
codon/runtime/floatlib/int_lib.h
# codon/runtime/floatlib/truncsfbf2.c
codon/runtime/floatlib/extendsfdf2.c
codon/runtime/floatlib/int_math.h
codon/runtime/floatlib/truncsfhf2.c
codon/runtime/floatlib/extendsftf2.c
codon/runtime/floatlib/int_types.h
codon/runtime/floatlib/trunctfdf2.c
codon/runtime/floatlib/fp_extend.h
codon/runtime/floatlib/int_util.h
codon/runtime/floatlib/trunctfhf2.c
codon/runtime/floatlib/fp_lib.h
# codon/runtime/floatlib/truncdfbf2.c
codon/runtime/floatlib/trunctfsf2.c)
target_compile_options(codonfloat PRIVATE -O3)
target_compile_definitions(codonfloat PRIVATE COMPILER_RT_HAS_FLOAT16)
set(CODONRT_FILES codon/runtime/lib.h codon/runtime/lib.cpp
codon/runtime/re.cpp codon/runtime/exc.cpp
codon/runtime/gpu.cpp)
codon/runtime/gpu.cpp codon/runtime/numpy/sort.cpp
codon/runtime/numpy/loops.cpp codon/runtime/numpy/zmath.cpp)
add_library(codonrt SHARED ${CODONRT_FILES})
add_dependencies(codonrt zlibstatic gc backtrace bz2 liblzma re2 fast_float)
add_dependencies(codonrt zlibstatic gc backtrace bz2 liblzma
re2 hwy hwy_contrib fast_float codonfloat)
if(DEFINED ENV{CODON_SYSTEM_LIBRARIES})
if(APPLE)
set(copied_libgfortran "${CMAKE_BINARY_DIR}/libgfortran.5${CMAKE_SHARED_LIBRARY_SUFFIX}")
set(copied_libquadmath "${CMAKE_BINARY_DIR}/libquadmath.0${CMAKE_SHARED_LIBRARY_SUFFIX}")
set(copied_libgcc "${CMAKE_BINARY_DIR}/libgcc_s.1.1${CMAKE_SHARED_LIBRARY_SUFFIX}")
else()
set(copied_libgfortran "${CMAKE_BINARY_DIR}/libgfortran${CMAKE_SHARED_LIBRARY_SUFFIX}.5")
set(copied_libquadmath "${CMAKE_BINARY_DIR}/libquadmath${CMAKE_SHARED_LIBRARY_SUFFIX}.0")
set(copied_libgcc "${CMAKE_BINARY_DIR}/libgcc_s${CMAKE_SHARED_LIBRARY_SUFFIX}.1")
endif()
add_custom_command(
OUTPUT ${copied_libgfortran}
DEPENDS "${CMAKE_SOURCE_DIR}/scripts/get_system_libs.sh"
COMMAND ${CMAKE_SOURCE_DIR}/scripts/get_system_libs.sh "$ENV{CODON_SYSTEM_LIBRARIES}" ${CMAKE_BINARY_DIR}
COMMENT "Copying system libraries to build directory")
add_custom_target(copy_libraries ALL DEPENDS ${copied_libgfortran})
add_dependencies(codonrt copy_libraries)
add_library(libgfortran SHARED IMPORTED)
set_target_properties(libgfortran PROPERTIES IMPORTED_LOCATION ${copied_libgfortran})
target_link_libraries(codonrt PRIVATE libgfortran)
else()
message(FATAL_ERROR "Set 'CODON_SYSTEM_LIBRARIES' to the directory containing system libraries.")
endif()
target_include_directories(codonrt PRIVATE ${backtrace_SOURCE_DIR}
${re2_SOURCE_DIR}
${highway_SOURCE_DIR}
"${gc_SOURCE_DIR}/include"
"${fast_float_SOURCE_DIR}/include" runtime)
target_link_libraries(codonrt PRIVATE fmt omp backtrace ${STATIC_LIBCPP}
LLVMSupport)
target_link_libraries(codonrt PRIVATE fmt omp backtrace LLVMSupport)
if(APPLE)
target_link_libraries(
codonrt
@ -91,13 +144,19 @@ if(APPLE)
-Wl,-force_load,$<TARGET_FILE:gc>
-Wl,-force_load,$<TARGET_FILE:bz2>
-Wl,-force_load,$<TARGET_FILE:liblzma>
-Wl,-force_load,$<TARGET_FILE:re2>)
-Wl,-force_load,$<TARGET_FILE:re2>
-Wl,-force_load,$<TARGET_FILE:hwy>
-Wl,-force_load,$<TARGET_FILE:hwy_contrib>
-Wl,-force_load,$<TARGET_FILE:codonfloat>)
target_link_libraries(codonrt PUBLIC "-framework Accelerate")
else()
add_dependencies(codonrt openblas)
target_link_libraries(
codonrt
PRIVATE -Wl,--whole-archive $<TARGET_FILE:zlibstatic> $<TARGET_FILE:gc>
$<TARGET_FILE:bz2> $<TARGET_FILE:liblzma> $<TARGET_FILE:re2>
-Wl,--no-whole-archive)
$<TARGET_FILE:openblas> $<TARGET_FILE:hwy> $<TARGET_FILE:hwy_contrib>
$<TARGET_FILE:codonfloat> -Wl,--no-whole-archive)
endif()
if(ASAN)
target_compile_options(
@ -173,6 +232,10 @@ set(CODON_HPPFILES
codon/cir/llvm/gpu.h
codon/cir/llvm/llvisitor.h
codon/cir/llvm/llvm.h
codon/cir/llvm/native/native.h
codon/cir/llvm/native/targets/aarch64.h
codon/cir/llvm/native/targets/target.h
codon/cir/llvm/native/targets/x86.h
codon/cir/llvm/optimize.h
codon/cir/module.h
codon/cir/pyextension.h
@ -187,6 +250,7 @@ set(CODON_HPPFILES
codon/cir/transform/folding/rule.h
codon/cir/transform/lowering/imperative.h
codon/cir/transform/lowering/pipeline.h
codon/cir/transform/numpy/numpy.h
codon/cir/transform/manager.h
codon/cir/transform/parallel/openmp.h
codon/cir/transform/parallel/schedule.h
@ -283,6 +347,9 @@ set(CODON_CPPFILES
codon/cir/instr.cpp
codon/cir/llvm/gpu.cpp
codon/cir/llvm/llvisitor.cpp
codon/cir/llvm/native/native.cpp
codon/cir/llvm/native/targets/aarch64.cpp
codon/cir/llvm/native/targets/x86.cpp
codon/cir/llvm/optimize.cpp
codon/cir/module.cpp
codon/cir/transform/cleanup/canonical.cpp
@ -294,6 +361,9 @@ set(CODON_CPPFILES
codon/cir/transform/folding/folding.cpp
codon/cir/transform/lowering/imperative.cpp
codon/cir/transform/lowering/pipeline.cpp
codon/cir/transform/numpy/expr.cpp
codon/cir/transform/numpy/forward.cpp
codon/cir/transform/numpy/numpy.cpp
codon/cir/transform/manager.cpp
codon/cir/transform/parallel/openmp.cpp
codon/cir/transform/parallel/schedule.cpp
@ -362,11 +432,7 @@ llvm_map_components_to_libnames(
TransformUtils
Vectorize
Passes)
if(APPLE)
target_link_libraries(codonc PRIVATE ${LLVM_LIBS} fmt dl codonrt)
else()
target_link_libraries(codonc PRIVATE ${STATIC_LIBCPP} ${LLVM_LIBS} fmt dl codonrt)
endif()
target_link_libraries(codonc PRIVATE ${LLVM_LIBS} fmt dl codonrt)
# Gather headers
add_custom_target(
@ -399,18 +465,24 @@ add_custom_target(
COMMAND
${CMAKE_COMMAND} -E copy
"${CMAKE_BINARY_DIR}/libomp${CMAKE_SHARED_LIBRARY_SUFFIX}"
"${CMAKE_BINARY_DIR}/lib/codon")
"${CMAKE_BINARY_DIR}/lib/codon"
COMMAND
${CMAKE_COMMAND} -E copy ${copied_libgfortran} "${CMAKE_BINARY_DIR}/lib/codon"
COMMAND
${CMAKE_COMMAND} -E copy ${copied_libquadmath} "${CMAKE_BINARY_DIR}/lib/codon"
COMMAND
${CMAKE_COMMAND} -E copy ${copied_libgcc} "${CMAKE_BINARY_DIR}/lib/codon")
add_dependencies(libs codonrt codonc)
# Codon command-line tool
add_executable(codon codon/app/main.cpp)
target_link_libraries(codon PUBLIC ${STATIC_LIBCPP} fmt codonc codon_jupyter Threads::Threads)
target_link_libraries(codon PUBLIC fmt codonc codon_jupyter Threads::Threads)
# Codon test Download and unpack googletest at configure time
include(FetchContent)
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@ -442,6 +514,9 @@ target_compile_definitions(codon_test
install(TARGETS codonrt codonc codon_jupyter DESTINATION lib/codon)
install(FILES ${CMAKE_BINARY_DIR}/libomp${CMAKE_SHARED_LIBRARY_SUFFIX} DESTINATION lib/codon)
install(FILES ${copied_libgfortran} DESTINATION lib/codon)
install(FILES ${copied_libquadmath} DESTINATION lib/codon)
install(FILES ${copied_libgcc} DESTINATION lib/codon)
install(TARGETS codon DESTINATION bin)
install(DIRECTORY ${CMAKE_BINARY_DIR}/include/codon DESTINATION include)
install(DIRECTORY ${CMAKE_SOURCE_DIR}/stdlib DESTINATION lib/codon)

240
LICENSE
View File

@ -1,91 +1,201 @@
Business Source License 1.1
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
"Business Source License" is a trademark of MariaDB Corporation Ab.
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-----------------------------------------------------------------------------
1. Definitions.
Parameters
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
Licensor: Exaloop, Inc.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
Licensed Work: Codon compiler, runtime, and standard library
The Licensed Work is (c) 2022-2024 Exaloop Inc.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
Additional Use Grant: None
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
Change Date: 2028-03-01
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
Change License: Apache License, Version 2.0
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
For information about alternative licensing arrangements for the Software,
please visit: https://exaloop.io/
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
-----------------------------------------------------------------------------
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
Terms
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
The Licensor hereby grants you the right to copy, modify, create derivative
works, redistribute, and make non-production use of the Licensed Work. The
Licensor may make an Additional Use Grant, above, permitting limited
production use.
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
Effective on the Change Date, or the fourth anniversary of the first publicly
available distribution of a specific version of the Licensed Work under this
License, whichever comes first, the Licensor hereby grants you rights under
the terms of the Change License, and the rights granted in the paragraph
above terminate.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
If your use of the Licensed Work does not comply with the requirements
currently in effect as described in this License, you must purchase a
commercial license from the Licensor, its affiliated entities, or authorized
resellers, or you must refrain from using the Licensed Work.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
All copies of the original and modified Licensed Work, and derivative works
of the Licensed Work, are subject to this License. This License applies
separately for each version of the Licensed Work and the Change Date may vary
for each version of the Licensed Work released by Licensor.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
You must conspicuously display this License on each original or modified copy
of the Licensed Work. If you receive the Licensed Work in original or
modified form from a third party, the terms and conditions set forth in this
License apply to your use of that work.
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
Any use of the Licensed Work in violation of this License will automatically
terminate your rights under this License for the current and all other
versions of the Licensed Work.
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
This License does not grant you any right in any trademark or logo of
Licensor or its affiliates (provided that you may use a trademark or logo of
Licensor as expressly required by this License).
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
TITLE.
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
MariaDB hereby grants you permission to use this License's text to license
your works, and to refer to it using the trademark "Business Source License",
as long as you comply with the Covenants of Licensor below.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
Covenants of Licensor
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
In consideration of the right to use this License's text and the "Business
Source License" name and trademark, Licensor covenants to MariaDB, and to all
other recipients of the licensed work to be provided by Licensor:
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
1. To specify as the Change License the GPL Version 2.0 or any later version,
or a license that is compatible with GPL Version 2.0 or a later version,
where "compatible" means that software provided under the Change License can
be included in a program with software provided under GPL Version 2.0 or a
later version. Licensor may specify additional Change Licenses without
limitation.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
2. To either: (a) specify an additional grant of rights to use that does not
impose any additional restriction on the right granted in this License, as
the Additional Use Grant; or (b) insert the text "None".
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
3. To specify a Change Date.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
4. Not to modify this License in any other way.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

149
README.md
View File

@ -1,19 +1,19 @@
<p align="center">
<img src="docs/img/codon.png?raw=true" width="600" alt="Codon"/>
</p>
<h1 align="center">
<img src="docs/img/codon-banner.svg" alt="Codon banner"/>
</h1>
<h3 align="center">
<a href="https://docs.exaloop.io/codon" target="_blank"><b>Docs</b></a>
&nbsp;&#183;&nbsp;
<a href="https://docs.exaloop.io/codon/general/faq" target="_blank"><b>FAQ</b></a>
&nbsp;&#183;&nbsp;
<a href="https://blog.exaloop.io" target="_blank"><b>Blog</b></a>
<a href="https://exaloop.io/blog" target="_blank"><b>Blog</b></a>
&nbsp;&#183;&nbsp;
<a href="https://join.slack.com/t/exaloop/shared_invite/zt-1jusa4kc0-T3rRWrrHDk_iZ1dMS8s0JQ" target="_blank">Chat</a>
&nbsp;&#183;&nbsp;
<a href="https://docs.exaloop.io/codon/general/roadmap" target="_blank">Roadmap</a>
&nbsp;&#183;&nbsp;
<a href="https://exaloop.io/benchmarks" target="_blank">Benchmarks</a>
<a href="https://exaloop.io/#benchmarks" target="_blank">Benchmarks</a>
</h3>
<a href="https://github.com/exaloop/codon/actions/workflows/ci.yml">
@ -21,7 +21,7 @@
alt="Build Status">
</a>
## What is Codon?
# What is Codon?
Codon is a high-performance Python implementation that compiles to native machine code without
any runtime overhead. Typical speedups over vanilla Python are on the order of 10-100x or more, on
@ -32,7 +32,7 @@ higher still.
*Think of Codon as Python reimagined for static, ahead-of-time compilation, built from the ground
up with best possible performance in mind.*
### Goals
## Goals
- :bulb: **No learning curve:** Be as close to CPython as possible in terms of syntax, semantics and libraries
- :rocket: **Top-notch performance:** At *least* on par with low-level languages like C, C++ or Rust
@ -41,7 +41,7 @@ up with best possible performance in mind.*
and libraries
- :battery: **Interoperability:** Full interoperability with Python's ecosystem of packages and libraries
### Non-goals
## Non-goals
- :x: *Drop-in replacement for CPython:* Codon is not a drop-in replacement for CPython. There are some
aspects of Python that are not suitable for static compilation — we don't support these in Codon.
@ -54,55 +54,62 @@ up with best possible performance in mind.*
features as much as possible. While Codon does add some new syntax in a couple places (e.g. to express
parallelism), we try to make it as familiar and intuitive as possible.
## Install
## How it works
Pre-built binaries for Linux (x86_64) and macOS (x86_64 and arm64) are available alongside [each release](https://github.com/exaloop/codon/releases).
Download and install with:
<p align="center">
<img src="docs/img/codon-figure.svg" width="90%" alt="Codon figure"/>
</p>
# Quick start
Download and install Codon with this command:
```bash
/bin/bash -c "$(curl -fsSL https://exaloop.io/install.sh)"
```
Or you can [build from source](https://docs.exaloop.io/codon/advanced/build).
After following the prompts, the `codon` command will be available to use. For example:
## Examples
- To run a program: `codon run file.py`
- To run a program with optimizations enabled: `codon run -release file.py`
- To compile to an executable: `codon build -release file.py`
- To generate LLVM IR: `codon build -release -llvm file.py`
Codon is a Python-compatible language, and many Python programs will work with few if any modifications:
Many more options are available and described in [the docs](https://docs.exaloop.io/codon/general/intro).
Alternatively, you can [build from source](https://docs.exaloop.io/codon/advanced/build).
# Examples
## Basics
Codon supports much of Python, and many Python programs will work with few if any modifications.
Here's a simple script `fib.py` that computes the 40th Fibonacci number...
``` python
from time import time
```python
def fib(n):
a, b = 0, 1
while a < n:
print(a, end=' ')
a, b = b, a+b
print()
fib(1000)
return n if n < 2 else fib(n - 1) + fib(n - 2)
t0 = time()
ans = fib(40)
t1 = time()
print(f'Computed fib(40) = {ans} in {t1 - t0} seconds.')
```
The `codon` compiler has a number of options and modes:
... run through Python and Codon:
```bash
# compile and run the program
codon run fib.py
# 0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987
# compile and run the program with optimizations enabled
codon run -release fib.py
# 0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987
# compile to executable with optimizations enabled
codon build -release -exe fib.py
./fib
# 0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987
# compile to LLVM IR file with optimizations enabled
codon build -release -llvm fib.py
# outputs file fib.ll
```
$ python3 fib.py
Computed fib(40) = 102334155 in 17.979357957839966 seconds.
$ codon run -release fib.py
Computed fib(40) = 102334155 in 0.275645 seconds.
```
See [the docs](https://docs.exaloop.io/codon/general/intro) for more options and examples.
## Using Python libraries
You can import and use any Python package from Codon. For example:
You can import and use any Python package from Codon via `from python import`. For example:
```python
from python import matplotlib.pyplot as plt
@ -112,11 +119,13 @@ plt.show()
```
(Just remember to set the `CODON_PYTHON` environment variable to the CPython shared library,
as explained in the [the docs](https://docs.exaloop.io/codon/interoperability/python).)
as explained in [the Python interoperability docs](https://docs.exaloop.io/codon/interoperability/python).)
This prime counting example showcases Codon's [OpenMP](https://www.openmp.org/) support, enabled
with the addition of one line. The `@par` annotation tells the compiler to parallelize the
following `for`-loop, in this case using a dynamic schedule, chunk size of 100, and 16 threads.
## Parallelism
Codon supports native multithreading via [OpenMP](https://www.openmp.org/). The `@par` annotation
in the code below tells the compiler to parallelize the following `for`-loop, in this case using
a dynamic schedule, chunk size of 100, and 16 threads.
```python
from sys import argv
@ -139,7 +148,10 @@ for i in range(2, limit):
print(total)
```
Codon supports writing and executing GPU kernels. Here's an example that computes the
Note that Codon automatically turns the `total += 1` statement in the loop body into an atomic
reduction to avoid race conditions. Learn more in the [multithreading docs](https://docs.exaloop.io/codon/advanced/parallel).
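A minimal sketch of that pattern, restated with the schedule, chunk size, and thread count spelled out (parameter names assumed; `is_prime` and `limit` are placeholders):

```python
total = 0

@par(schedule='dynamic', chunk_size=100, num_threads=16)
for i in range(2, limit):
    if is_prime(i):
        total += 1  # compiled as an atomic/OpenMP reduction rather than a racy update

print(total)
```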
Codon also supports writing and executing GPU kernels. Here's an example that computes the
[Mandelbrot set](https://en.wikipedia.org/wiki/Mandelbrot_set):
```python
@ -169,8 +181,47 @@ def mandelbrot(pixels):
mandelbrot(pixels, grid=(N*N)//1024, block=1024)
```
GPU programming can also be done using the `@par` syntax with `@par(gpu=True)`.
GPU programming can also be done using the `@par` syntax with `@par(gpu=True)`. See the
[GPU programming docs](https://docs.exaloop.io/codon/advanced/gpu) for more details.
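A minimal sketch of the `@par(gpu=True)` form, with illustrative names and sizes:

```python
N = 1024
x = [float(i) for i in range(N)]
y = [0.0] * N

@par(gpu=True)
for i in range(N):
    y[i] = x[i] * x[i]  # loop body is offloaded to the GPU
```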
## Documentation
## NumPy support
Please see [docs.exaloop.io](https://docs.exaloop.io/codon) for in-depth documentation.
Codon includes a feature-complete, fully-compiled native NumPy implementation. It uses the same
API as NumPy, but re-implements everything in Codon itself, allowing for a range of optimizations
and performance improvements.
Here's an example NumPy program that approximates $\pi$ using random numbers...
``` python
import time
import numpy as np
rng = np.random.default_rng(seed=0)
x = rng.random(500_000_000)
y = rng.random(500_000_000)
t0 = time.time()
# pi ~= 4 x (fraction of points in circle)
pi = ((x-1)**2 + (y-1)**2 < 1).sum() * (4 / len(x))
t1 = time.time()
print(f'Computed pi~={pi:.4f} in {t1 - t0:.2f} sec')
```
... run through Python and Codon:
```
$ python3 pi.py
Computed pi~=3.1417 in 2.25 sec
$ codon run -release pi.py
Computed pi~=3.1417 in 0.43 sec
```
Codon can speed up NumPy code through general-purpose and NumPy-specific compiler optimizations,
including inlining, fusion, memory allocation elision and more. Furthermore, Codon's NumPy
implementation works with its multithreading and GPU capabilities, and can even integrate with
[PyTorch](https://pytorch.org). Learn more in the [Codon-NumPy docs](https://docs.exaloop.io/codon/interoperability/numpy).
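As an illustration of the kind of code those optimizations target (an assumed example, not from the docs): a chained elementwise expression like the one below would ordinarily allocate a temporary array per operator, which fusion and allocation elision can avoid.

```python
import numpy as np

a = np.random.rand(1_000_000)
b = np.random.rand(1_000_000)
c = np.random.rand(1_000_000)

# candidate for a single fused elementwise pass instead of one pass
# (and one temporary array) per operator
d = (a + b) * c - np.sqrt(a * a + b * b)
```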
# Documentation
Please see [docs.exaloop.io](https://docs.exaloop.io) for in-depth documentation.

View File

@ -1,8 +1,8 @@
set(CPM_DOWNLOAD_VERSION 0.32.3)
set(CPM_DOWNLOAD_VERSION 0.40.8)
set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
message(STATUS "Downloading CPM.cmake...")
file(DOWNLOAD https://github.com/TheLartians/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake ${CPM_DOWNLOAD_LOCATION})
file(DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake ${CPM_DOWNLOAD_LOCATION})
endif()
include(${CPM_DOWNLOAD_LOCATION})
@ -77,9 +77,9 @@ endif()
CPMAddPackage(
NAME bdwgc
GITHUB_REPOSITORY "ivmai/bdwgc"
GITHUB_REPOSITORY "exaloop/bdwgc"
VERSION 8.0.5
GIT_TAG d0ba209660ea8c663e06d9a68332ba5f42da54ba
GIT_TAG e16c67244aff26802203060422545d38305e0160
EXCLUDE_FROM_ALL YES
OPTIONS "CMAKE_POSITION_INDEPENDENT_CODE ON"
"BUILD_SHARED_LIBS OFF"
@ -163,3 +163,28 @@ CPMAddPackage(
GITHUB_REPOSITORY "fastfloat/fast_float"
GIT_TAG v6.1.1
EXCLUDE_FROM_ALL YES)
if(NOT APPLE)
enable_language(Fortran)
CPMAddPackage(
NAME openblas
GITHUB_REPOSITORY "OpenMathLib/OpenBLAS"
GIT_TAG v0.3.29
EXCLUDE_FROM_ALL YES
OPTIONS "DYNAMIC_ARCH ON"
"BUILD_TESTING OFF"
"BUILD_BENCHMARKS OFF"
"NUM_THREADS 64"
"CCOMMON_OPT -O3")
endif()
CPMAddPackage(
NAME highway
GITHUB_REPOSITORY "google/highway"
GIT_TAG 1.2.0
EXCLUDE_FROM_ALL YES
OPTIONS "HWY_ENABLE_CONTRIB ON"
"HWY_ENABLE_EXAMPLES OFF"
"HWY_ENABLE_INSTALL OFF"
"HWY_ENABLE_TESTS OFF"
"BUILD_TESTING OFF")

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include <algorithm>
#include <cstdio>
@ -11,6 +11,7 @@
#include <unordered_map>
#include <vector>
#include "codon/cir/util/format.h"
#include "codon/compiler/compiler.h"
#include "codon/compiler/error.h"
#include "codon/compiler/jit.h"
@ -87,7 +88,7 @@ void initLogFlags(const llvm::cl::opt<std::string> &log) {
codon::getLogger().parse(std::string(d));
}
enum BuildKind { LLVM, Bitcode, Object, Executable, Library, PyExtension, Detect };
enum BuildKind { LLVM, Bitcode, Object, Executable, Library, PyExtension, Detect, CIR };
enum OptMode { Debug, Release };
enum Numerics { C, Python };
} // namespace
@ -121,7 +122,8 @@ int docMode(const std::vector<const char *> &args, const std::string &argv0) {
}
};
collectPaths(args[1]);
if (args.size() > 1)
collectPaths(args[1]);
auto compiler = std::make_unique<codon::Compiler>(args[0]);
bool failed = false;
auto result = compiler->docgen(files);
@ -332,6 +334,7 @@ int buildMode(const std::vector<const char *> &args, const std::string &argv0) {
clEnumValN(Executable, "exe", "Generate executable"),
clEnumValN(Library, "lib", "Generate shared library"),
clEnumValN(PyExtension, "pyext", "Generate Python extension module"),
clEnumValN(CIR, "cir", "Generate Codon Intermediate Representation"),
clEnumValN(Detect, "detect",
"Detect output type based on output file extension")),
llvm::cl::init(Detect));
@ -371,6 +374,9 @@ int buildMode(const std::vector<const char *> &args, const std::string &argv0) {
case BuildKind::Detect:
extension = "";
break;
case BuildKind::CIR:
extension = ".cir";
break;
default:
seqassertn(0, "unknown build kind");
}
@ -400,6 +406,11 @@ int buildMode(const std::vector<const char *> &args, const std::string &argv0) {
compiler->getLLVMVisitor()->writeToPythonExtension(*compiler->getCache()->pyModule,
filename);
break;
case BuildKind::CIR: {
std::ofstream out(filename);
codon::ir::util::format(out, compiler->getModule());
break;
}
case BuildKind::Detect:
compiler->getLLVMVisitor()->compile(filename, argv0, libsVec, lflags);
break;
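Given the `cir` output kind and `.cir` extension registered above, the new flag would presumably be invoked like the other `codon build` output-type options (a hypothetical usage, mirroring `-llvm`):

```bash
codon build -release -cir fib.py
# expected to write the module's Codon Intermediate Representation to fib.cir
```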

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "analysis.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "capture.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "cfg.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "dominator.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "reaching.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "global_vars.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "side_effect.h"
@ -293,7 +293,7 @@ struct SideEfectAnalyzer : public util::ConstVisitor {
}
void visit(const CallInstr *v) override {
auto s = Status::PURE;
auto s = process(v->getCallee());
auto callStatus = Status::UNKNOWN;
for (auto *x : *v) {
s = max(s, process(x));
@ -303,7 +303,6 @@ struct SideEfectAnalyzer : public util::ConstVisitor {
s = max(s, callStatus);
} else {
// unknown function
process(v->getCallee());
s = Status::UNKNOWN;
}
set(v, s, callStatus);
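The reordering above folds the callee expression's own side-effect status into the call's status even when the callee resolves to a known function, instead of only processing it in the unknown-function branch. A minimal illustration of the case this covers, written as plain Python with hypothetical names:

```python
def f(x: int) -> int:
    return x + 1       # a pure function

def pick():
    print("choosing")  # evaluating the callee expression has a side effect
    return f

r = pick()(2)  # the call is impure even though the resolved callee (f) is pure
```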

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "attribute.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "base.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "const.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "nodes.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "flow.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "func.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "instr.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "gpu.h"
@ -6,6 +6,7 @@
#include <memory>
#include <string>
#include "codon/cir/llvm/optimize.h"
#include "codon/util/common.h"
namespace codon {
@ -204,6 +205,139 @@ llvm::Function *makeNoOp(llvm::Function *F) {
using Codegen =
std::function<void(llvm::IRBuilder<> &, const std::vector<llvm::Value *> &)>;
void codegenVectorizedUnaryLoop(llvm::IRBuilder<> &B,
const std::vector<llvm::Value *> &args,
llvm::Function *func) {
// Create IR to represent:
// p_in = in
// p_out = out
// for i in range(n):
// *p_out = func(*p_in)
// p_in += is
// p_out += os
auto &context = B.getContext();
auto *parent = B.GetInsertBlock()->getParent();
auto *ty = func->getReturnType();
auto *in = args[0];
auto *is = args[1];
auto *out = args[2];
auto *os = args[3];
auto *n = args[4];
auto *loop = llvm::BasicBlock::Create(context, "loop", parent);
auto *exit = llvm::BasicBlock::Create(context, "exit", parent);
auto *pinStore = B.CreateAlloca(B.getPtrTy());
auto *poutStore = B.CreateAlloca(B.getPtrTy());
auto *idxStore = B.CreateAlloca(B.getInt64Ty());
// p_in = in
B.CreateStore(in, pinStore);
// p_out = out
B.CreateStore(out, poutStore);
// i = 0
B.CreateStore(B.getInt64(0), idxStore);
// if n > 0: goto loop; else: goto exit
B.CreateCondBr(B.CreateICmpSGT(n, B.getInt64(0)), loop, exit);
// load pointers
B.SetInsertPoint(loop);
auto *pin = B.CreateLoad(B.getPtrTy(), pinStore);
auto *pout = B.CreateLoad(B.getPtrTy(), poutStore);
// y = func(x)
auto *x = B.CreateLoad(ty, pin);
auto *y = B.CreateCall(func, x);
B.CreateStore(y, pout);
auto *idx = B.CreateLoad(B.getInt64Ty(), idxStore);
// i += 1
B.CreateStore(B.CreateAdd(idx, B.getInt64(1)), idxStore);
// p_in += is
B.CreateStore(B.CreateGEP(B.getInt8Ty(), pin, is), pinStore);
// p_out += os
B.CreateStore(B.CreateGEP(B.getInt8Ty(), pout, os), poutStore);
idx = B.CreateLoad(B.getInt64Ty(), idxStore);
// if i < n: goto loop; else: goto exit
B.CreateCondBr(B.CreateICmpSLT(idx, n), loop, exit);
B.SetInsertPoint(exit);
B.CreateRet(llvm::UndefValue::get(parent->getReturnType()));
}
void codegenVectorizedBinaryLoop(llvm::IRBuilder<> &B,
const std::vector<llvm::Value *> &args,
llvm::Function *func) {
// Create IR to represent:
// p_in1 = in1
// p_in2 = in2
// p_out = out
// for i in range(n):
// *p_out = func(*p_in1, *p_in2)
// p_in1 += is1
// p_in2 += is2
// p_out += os
auto &context = B.getContext();
auto *parent = B.GetInsertBlock()->getParent();
auto *ty = func->getReturnType();
auto *in1 = args[0];
auto *is1 = args[1];
auto *in2 = args[2];
auto *is2 = args[3];
auto *out = args[4];
auto *os = args[5];
auto *n = args[6];
auto *loop = llvm::BasicBlock::Create(context, "loop", parent);
auto *exit = llvm::BasicBlock::Create(context, "exit", parent);
auto *pin1Store = B.CreateAlloca(B.getPtrTy());
auto *pin2Store = B.CreateAlloca(B.getPtrTy());
auto *poutStore = B.CreateAlloca(B.getPtrTy());
auto *idxStore = B.CreateAlloca(B.getInt64Ty());
// p_in1 = in1
B.CreateStore(in1, pin1Store);
// p_in2 = in2
B.CreateStore(in2, pin2Store);
// p_out = out
B.CreateStore(out, poutStore);
// i = 0
B.CreateStore(B.getInt64(0), idxStore);
// if n > 0: goto loop; else: goto exit
B.CreateCondBr(B.CreateICmpSGT(n, B.getInt64(0)), loop, exit);
// load pointers
B.SetInsertPoint(loop);
auto *pin1 = B.CreateLoad(B.getPtrTy(), pin1Store);
auto *pin2 = B.CreateLoad(B.getPtrTy(), pin2Store);
auto *pout = B.CreateLoad(B.getPtrTy(), poutStore);
// y = func(x1, x2)
auto *x1 = B.CreateLoad(ty, pin1);
auto *x2 = B.CreateLoad(ty, pin2);
auto *y = B.CreateCall(func, {x1, x2});
B.CreateStore(y, pout);
auto *idx = B.CreateLoad(B.getInt64Ty(), idxStore);
// i += 1
B.CreateStore(B.CreateAdd(idx, B.getInt64(1)), idxStore);
// p_in1 += is1
B.CreateStore(B.CreateGEP(B.getInt8Ty(), pin1, is1), pin1Store);
// p_in2 += is2
B.CreateStore(B.CreateGEP(B.getInt8Ty(), pin2, is2), pin2Store);
// p_out += os
B.CreateStore(B.CreateGEP(B.getInt8Ty(), pout, os), poutStore);
idx = B.CreateLoad(B.getInt64Ty(), idxStore);
// if i < n: goto loop; else: goto exit
B.CreateCondBr(B.CreateICmpSLT(idx, n), loop, exit);
B.SetInsertPoint(exit);
B.CreateRet(llvm::UndefValue::get(parent->getReturnType()));
}
llvm::Function *makeFillIn(llvm::Function *F, Codegen codegen) {
auto *M = F->getParent();
auto &context = M->getContext();
@ -346,6 +480,13 @@ void remapFunctions(llvm::Module *M) {
B.CreateRet(mem);
}},
{"seq_alloc_uncollectable",
[](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) {
auto *M = B.GetInsertBlock()->getModule();
llvm::Value *mem = B.CreateCall(makeMalloc(M), args[0]);
B.CreateRet(mem);
}},
{"seq_alloc_atomic",
[](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) {
auto *M = B.GetInsertBlock()->getModule();
@ -353,6 +494,13 @@ void remapFunctions(llvm::Module *M) {
B.CreateRet(mem);
}},
{"seq_alloc_atomic_uncollectable",
[](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) {
auto *M = B.GetInsertBlock()->getModule();
llvm::Value *mem = B.CreateCall(makeMalloc(M), args[0]);
B.CreateRet(mem);
}},
{"seq_realloc",
[](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) {
auto *M = B.GetInsertBlock()->getModule();
@ -396,6 +544,93 @@ void remapFunctions(llvm::Module *M) {
[](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) {
B.CreateUnreachable();
}},
#define FILLIN_VECLOOP_UNARY32(loop, func) \
{ \
loop, [](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) { \
auto *M = B.GetInsertBlock()->getModule(); \
auto f = llvm::cast<llvm::Function>( \
M->getOrInsertFunction(func, B.getFloatTy(), B.getFloatTy()).getCallee()); \
f->setWillReturn(); \
codegenVectorizedUnaryLoop(B, args, f); \
} \
}
#define FILLIN_VECLOOP_UNARY64(loop, func) \
{ \
loop, [](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) { \
auto *M = B.GetInsertBlock()->getModule(); \
auto f = llvm::cast<llvm::Function>( \
M->getOrInsertFunction(func, B.getDoubleTy(), B.getDoubleTy()).getCallee()); \
f->setWillReturn(); \
codegenVectorizedUnaryLoop(B, args, f); \
} \
}
#define FILLIN_VECLOOP_BINARY32(loop, func) \
{ \
loop, [](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) { \
auto *M = B.GetInsertBlock()->getModule(); \
auto f = llvm::cast<llvm::Function>( \
M->getOrInsertFunction(func, B.getFloatTy(), B.getFloatTy(), B.getFloatTy()) \
.getCallee()); \
f->setWillReturn(); \
codegenVectorizedBinaryLoop(B, args, f); \
} \
}
#define FILLIN_VECLOOP_BINARY64(loop, func) \
{ \
loop, [](llvm::IRBuilder<> &B, const std::vector<llvm::Value *> &args) { \
auto *M = B.GetInsertBlock()->getModule(); \
auto f = llvm::cast<llvm::Function>( \
M->getOrInsertFunction(func, B.getDoubleTy(), B.getDoubleTy(), \
B.getDoubleTy()) \
.getCallee()); \
f->setWillReturn(); \
codegenVectorizedBinaryLoop(B, args, f); \
} \
}
FILLIN_VECLOOP_UNARY64("cnp_acos_float64", "__nv_acos"),
FILLIN_VECLOOP_UNARY64("cnp_acosh_float64", "__nv_acosh"),
FILLIN_VECLOOP_UNARY64("cnp_asin_float64", "__nv_asin"),
FILLIN_VECLOOP_UNARY64("cnp_asinh_float64", "__nv_asinh"),
FILLIN_VECLOOP_UNARY64("cnp_atan_float64", "__nv_atan"),
FILLIN_VECLOOP_UNARY64("cnp_atanh_float64", "__nv_atanh"),
FILLIN_VECLOOP_BINARY64("cnp_atan2_float64", "__nv_atan2"),
FILLIN_VECLOOP_UNARY64("cnp_exp_float64", "__nv_exp"),
FILLIN_VECLOOP_UNARY64("cnp_exp2_float64", "__nv_exp2"),
FILLIN_VECLOOP_UNARY64("cnp_expm1_float64", "__nv_expm1"),
FILLIN_VECLOOP_UNARY64("cnp_log_float64", "__nv_log"),
FILLIN_VECLOOP_UNARY64("cnp_log10_float64", "__nv_log10"),
FILLIN_VECLOOP_UNARY64("cnp_log1p_float64", "__nv_log1p"),
FILLIN_VECLOOP_UNARY64("cnp_log2_float64", "__nv_log2"),
FILLIN_VECLOOP_UNARY64("cnp_sin_float64", "__nv_sin"),
FILLIN_VECLOOP_UNARY64("cnp_sinh_float64", "__nv_sinh"),
FILLIN_VECLOOP_UNARY64("cnp_tan_float64", "__nv_tan"),
FILLIN_VECLOOP_UNARY64("cnp_tanh_float64", "__nv_tanh"),
FILLIN_VECLOOP_BINARY64("cnp_hypot_float64", "__nv_hypot"),
FILLIN_VECLOOP_UNARY32("cnp_acos_float32", "__nv_acosf"),
FILLIN_VECLOOP_UNARY32("cnp_acosh_float32", "__nv_acoshf"),
FILLIN_VECLOOP_UNARY32("cnp_asin_float32", "__nv_asinf"),
FILLIN_VECLOOP_UNARY32("cnp_asinh_float32", "__nv_asinhf"),
FILLIN_VECLOOP_UNARY32("cnp_atan_float32", "__nv_atanf"),
FILLIN_VECLOOP_UNARY32("cnp_atanh_float32", "__nv_atanhf"),
FILLIN_VECLOOP_BINARY32("cnp_atan2_float32", "__nv_atan2f"),
FILLIN_VECLOOP_UNARY32("cnp_exp_float32", "__nv_expf"),
FILLIN_VECLOOP_UNARY32("cnp_exp2_float32", "__nv_exp2f"),
FILLIN_VECLOOP_UNARY32("cnp_expm1_float32", "__nv_expm1f"),
FILLIN_VECLOOP_UNARY32("cnp_log_float32", "__nv_logf"),
FILLIN_VECLOOP_UNARY32("cnp_log10_float32", "__nv_log10f"),
FILLIN_VECLOOP_UNARY32("cnp_log1p_float32", "__nv_log1pf"),
FILLIN_VECLOOP_UNARY32("cnp_log2_float32", "__nv_log2f"),
FILLIN_VECLOOP_UNARY32("cnp_sin_float32", "__nv_sinf"),
FILLIN_VECLOOP_UNARY32("cnp_sinh_float32", "__nv_sinhf"),
FILLIN_VECLOOP_UNARY32("cnp_tan_float32", "__nv_tanf"),
FILLIN_VECLOOP_UNARY32("cnp_tanh_float32", "__nv_tanhf"),
FILLIN_VECLOOP_BINARY32("cnp_hypot_float32", "__nv_hypotf"),
};
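// A hedged sketch of what each FILLIN_VECLOOP_* entry above produces on the
// GPU path: a runtime loop symbol such as "cnp_sin_float64" is given a body
// that walks its strided buffers and applies the matching libdevice routine
// ("__nv_sin") element-wise, roughly:
//
//   for (int64_t i = 0; i < n; ++i, p_in += is, p_out += os)
//     *(double *)p_out = __nv_sin(*(double *)p_in);
//
// The exact argument layout comes from codegenVectorizedUnaryLoop /
// codegenVectorizedBinaryLoop; the pointer and stride names are illustrative.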
for (auto &pair : remapping) {
@ -636,6 +871,11 @@ void applyGPUTransformations(llvm::Module *M, const std::string &ptxFilename) {
clone->setTargetTriple(llvm::Triple::normalize(GPU_TRIPLE));
clone->setDataLayout(GPU_DL);
if (isFastMathOn()) {
clone->addModuleFlag(llvm::Module::ModFlagBehavior::Override, "nvvm-reflect-ftz",
1);
}
llvm::NamedMDNode *nvvmAnno = clone->getOrInsertNamedMetadata("nvvm.annotations");
std::vector<llvm::GlobalValue *> kernels;

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "llvisitor.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -0,0 +1,115 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "native.h"
#include "codon/cir/llvm/llvm.h"
#include "codon/cir/llvm/native/targets/aarch64.h"
#include "codon/cir/llvm/native/targets/x86.h"
namespace codon {
namespace ir {
namespace {
std::unique_ptr<Target> getNativeTarget(const llvm::Triple &triple) {
std::unique_ptr<Target> result = std::unique_ptr<Target>();
switch (triple.getArch()) {
default:
break;
case llvm::Triple::mips:
case llvm::Triple::mipsel:
case llvm::Triple::mips64:
case llvm::Triple::mips64el:
// nothing
break;
case llvm::Triple::arm:
case llvm::Triple::armeb:
case llvm::Triple::thumb:
case llvm::Triple::thumbeb:
// nothing
break;
case llvm::Triple::ppc:
case llvm::Triple::ppcle:
case llvm::Triple::ppc64:
case llvm::Triple::ppc64le:
// nothing
break;
case llvm::Triple::riscv32:
case llvm::Triple::riscv64:
// nothing
break;
case llvm::Triple::systemz:
// nothing
break;
case llvm::Triple::aarch64:
case llvm::Triple::aarch64_32:
case llvm::Triple::aarch64_be:
result = std::make_unique<Aarch64>();
break;
case llvm::Triple::x86:
case llvm::Triple::x86_64:
result = std::make_unique<X86>();
break;
case llvm::Triple::hexagon:
// nothing
break;
case llvm::Triple::wasm32:
case llvm::Triple::wasm64:
// nothing
break;
case llvm::Triple::sparc:
case llvm::Triple::sparcel:
case llvm::Triple::sparcv9:
// nothing
break;
case llvm::Triple::r600:
case llvm::Triple::amdgcn:
// nothing
break;
case llvm::Triple::msp430:
// nothing
break;
case llvm::Triple::ve:
// nothing
break;
}
return result;
}
class ArchNativePass : public llvm::PassInfoMixin<ArchNativePass> {
private:
std::string cpu;
std::string features;
public:
explicit ArchNativePass(const std::string &cpu = "", const std::string &features = "")
: cpu(cpu), features(features) {}
llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &) {
if (!cpu.empty())
F.addFnAttr("target-cpu", cpu);
if (!features.empty())
F.addFnAttr("target-features", features);
F.addFnAttr("frame-pointer", "none");
return llvm::PreservedAnalyses::all();
}
};
} // namespace
void addNativeLLVMPasses(llvm::PassBuilder *pb) {
llvm::Triple triple = llvm::EngineBuilder().selectTarget()->getTargetTriple();
auto target = getNativeTarget(triple);
if (!target)
return;
std::string cpu = target->getCPU(triple);
std::string features = target->getFeatures(triple);
pb->registerPipelineEarlySimplificationEPCallback(
[cpu, features](llvm::ModulePassManager &pm, llvm::OptimizationLevel opt) {
pm.addPass(
llvm::createModuleToFunctionPassAdaptor(ArchNativePass(cpu, features)));
});
}
} // namespace ir
} // namespace codon
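// A minimal usage sketch of the hook this file adds (assumes the standard
// LLVM PassBuilder setup; illustrative, not part of the patch itself):
//
//   llvm::PassBuilder pb;
//   codon::ir::addNativeLLVMPasses(&pb); // no-op on unsupported architectures
//
// After the early-simplification callback fires, each function carries
// "target-cpu" / "target-features" attributes matching the host, plus
// "frame-pointer"="none".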

View File

@ -0,0 +1,13 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once
#include "codon/cir/llvm/llvm.h"
namespace codon {
namespace ir {
void addNativeLLVMPasses(llvm::PassBuilder *pb);
} // namespace ir
} // namespace codon

View File

@ -0,0 +1,162 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "aarch64.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
namespace codon {
namespace ir {
namespace {
template <typename T> std::string join(const T &v, const std::string &delim = ",") {
std::ostringstream s;
for (const auto &i : v) {
if (&i != &v[0])
s << delim;
s << std::string(i);
}
return s.str();
}
} // namespace
std::string Aarch64::getCPU(const llvm::Triple &triple) const {
return llvm::sys::getHostCPUName().str();
}
std::string Aarch64::getFeatures(const llvm::Triple &triple) const {
std::vector<llvm::StringRef> features;
// Enable NEON by default.
features.push_back("+neon");
std::string cpu(llvm::sys::getHostCPUName());
const std::optional<llvm::AArch64::CpuInfo> cpuInfo = llvm::AArch64::parseCpu(cpu);
if (!cpuInfo)
return "";
if (cpu == "cyclone" || llvm::StringRef(cpu).startswith("apple")) {
features.push_back("+zcm");
features.push_back("+zcz");
}
auto *archInfo = &cpuInfo->Arch;
features.push_back(archInfo->ArchFeature);
uint64_t extension = cpuInfo->getImpliedExtensions();
if (!llvm::AArch64::getExtensionFeatures(extension, features))
return "";
// Handle (arch-dependent) fp16fml/fullfp16 relationship.
// FIXME: this fp16fml option handling will be reimplemented after the
// TargetParser rewrite.
const auto ItRNoFullFP16 = std::find(features.rbegin(), features.rend(), "-fullfp16");
const auto ItRFP16FML = std::find(features.rbegin(), features.rend(), "+fp16fml");
if (llvm::is_contained(features, "+v8.4a")) {
const auto ItRFullFP16 = std::find(features.rbegin(), features.rend(), "+fullfp16");
if (ItRFullFP16 < ItRNoFullFP16 && ItRFullFP16 < ItRFP16FML) {
// Only entangled feature that can be to the right of this +fullfp16 is -fp16fml.
// Only append the +fp16fml if there is no -fp16fml after the +fullfp16.
if (std::find(features.rbegin(), ItRFullFP16, "-fp16fml") == ItRFullFP16)
features.push_back("+fp16fml");
} else
goto fp16_fml_fallthrough;
} else {
fp16_fml_fallthrough:
// In both of these cases, putting the 'other' feature on the end of the vector will
// result in the same effect as placing it immediately after the current feature.
if (ItRNoFullFP16 < ItRFP16FML)
features.push_back("-fp16fml");
else if (ItRNoFullFP16 > ItRFP16FML)
features.push_back("+fullfp16");
}
// FIXME: this needs reimplementation too after the TargetParser rewrite
//
// Context sensitive meaning of Crypto:
// 1) For Arch >= ARMv8.4a: crypto = sm4 + sha3 + sha2 + aes
// 2) For Arch <= ARMv8.3a: crypto = sha2 + aes
const auto ItBegin = features.begin();
const auto ItEnd = features.end();
const auto ItRBegin = features.rbegin();
const auto ItREnd = features.rend();
const auto ItRCrypto = std::find(ItRBegin, ItREnd, "+crypto");
const auto ItRNoCrypto = std::find(ItRBegin, ItREnd, "-crypto");
const auto HasCrypto = ItRCrypto != ItREnd;
const auto HasNoCrypto = ItRNoCrypto != ItREnd;
const ptrdiff_t PosCrypto = ItRCrypto - ItRBegin;
const ptrdiff_t PosNoCrypto = ItRNoCrypto - ItRBegin;
bool NoCrypto = false;
if (HasCrypto && HasNoCrypto) {
if (PosNoCrypto < PosCrypto)
NoCrypto = true;
}
if (std::find(ItBegin, ItEnd, "+v8.4a") != ItEnd) {
if (HasCrypto && !NoCrypto) {
// Check if we have NOT disabled an algorithm with something like:
// +crypto, -algorithm
// And if "-algorithm" does not occur, we enable that crypto algorithm.
const bool HasSM4 = (std::find(ItBegin, ItEnd, "-sm4") == ItEnd);
const bool HasSHA3 = (std::find(ItBegin, ItEnd, "-sha3") == ItEnd);
const bool HasSHA2 = (std::find(ItBegin, ItEnd, "-sha2") == ItEnd);
const bool HasAES = (std::find(ItBegin, ItEnd, "-aes") == ItEnd);
if (HasSM4)
features.push_back("+sm4");
if (HasSHA3)
features.push_back("+sha3");
if (HasSHA2)
features.push_back("+sha2");
if (HasAES)
features.push_back("+aes");
} else if (HasNoCrypto) {
// Check if we have NOT enabled a crypto algorithm with something like:
// -crypto, +algorithm
// And if "+algorithm" does not occur, we disable that crypto algorithm.
const bool HasSM4 = (std::find(ItBegin, ItEnd, "+sm4") != ItEnd);
const bool HasSHA3 = (std::find(ItBegin, ItEnd, "+sha3") != ItEnd);
const bool HasSHA2 = (std::find(ItBegin, ItEnd, "+sha2") != ItEnd);
const bool HasAES = (std::find(ItBegin, ItEnd, "+aes") != ItEnd);
if (!HasSM4)
features.push_back("-sm4");
if (!HasSHA3)
features.push_back("-sha3");
if (!HasSHA2)
features.push_back("-sha2");
if (!HasAES)
features.push_back("-aes");
}
} else {
if (HasCrypto && !NoCrypto) {
const bool HasSHA2 = (std::find(ItBegin, ItEnd, "-sha2") == ItEnd);
const bool HasAES = (std::find(ItBegin, ItEnd, "-aes") == ItEnd);
if (HasSHA2)
features.push_back("+sha2");
if (HasAES)
features.push_back("+aes");
} else if (HasNoCrypto) {
const bool HasSHA2 = (std::find(ItBegin, ItEnd, "+sha2") != ItEnd);
const bool HasAES = (std::find(ItBegin, ItEnd, "+aes") != ItEnd);
const bool HasV82a = (std::find(ItBegin, ItEnd, "+v8.2a") != ItEnd);
const bool HasV83a = (std::find(ItBegin, ItEnd, "+v8.3a") != ItEnd);
const bool HasV84a = (std::find(ItBegin, ItEnd, "+v8.4a") != ItEnd);
if (!HasSHA2)
features.push_back("-sha2");
if (!HasAES)
features.push_back("-aes");
if (HasV82a || HasV83a || HasV84a) {
features.push_back("-sm4");
features.push_back("-sha3");
}
}
}
auto V8_6Pos = llvm::find(features, "+v8.6a");
if (V8_6Pos != std::end(features))
V8_6Pos = features.insert(std::next(V8_6Pos), {"+i8mm", "+bf16"});
if (triple.isOSOpenBSD())
features.push_back("+strict-align");
return join(features);
}
} // namespace ir
} // namespace codon
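// A short, hedged example of how these helpers are consumed (CPU names and
// feature strings vary by host; the values below are illustrative only):
//
//   llvm::Triple triple(llvm::sys::getDefaultTargetTriple());
//   codon::ir::Aarch64 aarch64;
//   std::string cpu = aarch64.getCPU(triple);           // e.g. "apple-m1"
//   std::string features = aarch64.getFeatures(triple); // e.g. "+neon,+zcm,+zcz,..."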

View File

@ -0,0 +1,17 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once
#include "codon/cir/llvm/native/targets/target.h"
namespace codon {
namespace ir {
class Aarch64 : public Target {
public:
std::string getCPU(const llvm::Triple &triple) const override;
std::string getFeatures(const llvm::Triple &triple) const override;
};
} // namespace ir
} // namespace codon

View File

@ -0,0 +1,21 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once
#include <sstream>
#include <string>
#include "codon/cir/llvm/llvm.h"
namespace codon {
namespace ir {
class Target {
public:
virtual ~Target() {}
virtual std::string getCPU(const llvm::Triple &triple) const = 0;
virtual std::string getFeatures(const llvm::Triple &triple) const = 0;
};
} // namespace ir
} // namespace codon

View File

@ -0,0 +1,108 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "x86.h"
namespace codon {
namespace ir {
namespace {
template <typename T> std::string join(const T &v, const std::string &delim = ",") {
std::ostringstream s;
for (const auto &i : v) {
if (&i != &v[0])
s << delim;
s << std::string(i);
}
return s.str();
}
} // namespace
std::string X86::getCPU(const llvm::Triple &triple) const {
auto CPU = llvm::sys::getHostCPUName();
if (!CPU.empty() && CPU != "generic")
return std::string(CPU);
// Select the default CPU if none was given (or detection failed).
if (!triple.isX86())
return ""; // This routine is only handling x86 targets.
bool is64Bit = triple.getArch() == llvm::Triple::x86_64;
// FIXME: Need target hooks.
if (triple.isOSDarwin()) {
if (triple.getArchName() == "x86_64h")
return "core-avx2";
// macosx10.12 drops support for all pre-Penryn Macs.
// Simulators can still run on 10.11 though, like Xcode.
if (triple.isMacOSX() && !triple.isOSVersionLT(10, 12))
return "penryn";
if (triple.isDriverKit())
return "nehalem";
// The oldest x86_64 Macs have core2/Merom; the oldest x86 Macs have Yonah.
return is64Bit ? "core2" : "yonah";
}
// Set up default CPU name for PS4/PS5 compilers.
if (triple.isPS4())
return "btver2";
if (triple.isPS5())
return "znver2";
// On Android use targets compatible with gcc
if (triple.isAndroid())
return is64Bit ? "x86-64" : "i686";
// Everything else goes to x86-64 in 64-bit mode.
if (is64Bit)
return "x86-64";
switch (triple.getOS()) {
case llvm::Triple::NetBSD:
return "i486";
case llvm::Triple::Haiku:
case llvm::Triple::OpenBSD:
return "i586";
case llvm::Triple::FreeBSD:
return "i686";
default:
// Fallback to p4.
return "pentium4";
}
}
std::string X86::getFeatures(const llvm::Triple &triple) const {
std::vector<std::string> features;
llvm::StringMap<bool> hostFeatures;
if (llvm::sys::getHostCPUFeatures(hostFeatures)) {
for (auto &f : hostFeatures) {
features.push_back((f.second ? "+" : "-") + f.first().str());
}
}
if (triple.getArchName() == "x86_64h") {
// x86_64h implies quite a few of the more modern subtarget features
// for Haswell class CPUs, but not all of them. Opt-out of a few.
features.push_back("-rdrnd");
features.push_back("-aes");
features.push_back("-pclmul");
features.push_back("-rtm");
features.push_back("-fsgsbase");
}
const llvm::Triple::ArchType ArchType = triple.getArch();
// Add features to be compatible with gcc for Android.
if (triple.isAndroid()) {
if (ArchType == llvm::Triple::x86_64) {
features.push_back("+sse4.2");
features.push_back("+popcnt");
features.push_back("+cx16");
} else
features.push_back("+ssse3");
}
return join(features);
}
} // namespace ir
} // namespace codon

View File

@ -0,0 +1,17 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once
#include "codon/cir/llvm/native/targets/target.h"
namespace codon {
namespace ir {
class X86 : public Target {
public:
std::string getCPU(const llvm::Triple &triple) const override;
std::string getFeatures(const llvm::Triple &triple) const override;
};
} // namespace ir
} // namespace codon

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "optimize.h"
@ -6,12 +6,23 @@
#include <deque>
#include "codon/cir/llvm/gpu.h"
#include "codon/cir/llvm/native/native.h"
#include "codon/util/common.h"
static llvm::codegen::RegisterCodeGenFlags CFG;
namespace codon {
namespace ir {
namespace {
llvm::cl::opt<bool>
AutoFree("auto-free",
llvm::cl::desc("Insert free() calls on allocated memory automatically"),
llvm::cl::init(false), llvm::cl::Hidden);
llvm::cl::opt<bool> FastMath("fast-math",
llvm::cl::desc("Apply fastmath optimizations"),
llvm::cl::init(false));
} // namespace
std::unique_ptr<llvm::TargetMachine>
getTargetMachine(llvm::Triple triple, llvm::StringRef cpuStr,
@ -77,6 +88,27 @@ void applyDebugTransformations(llvm::Module *module, bool debug, bool jit) {
}
}
void applyFastMathTransformations(llvm::Module *module) {
if (!FastMath)
return;
for (auto &f : *module) {
for (auto &block : f) {
for (auto &inst : block) {
if (auto *binop = llvm::dyn_cast<llvm::BinaryOperator>(&inst)) {
if (binop->getType()->isFloatingPointTy())
binop->setFast(true);
}
if (auto *intrinsic = llvm::dyn_cast<llvm::IntrinsicInst>(&inst)) {
if (intrinsic->getType()->isFloatingPointTy())
intrinsic->setFast(true);
}
}
}
}
}
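// Effect sketch (illustrative): with -fast-math enabled, a floating-point
// instruction such as
//   %r = fadd double %a, %b
// becomes
//   %r = fadd fast double %a, %b
// which licenses reassociation, reciprocal math, and no-NaN/no-Inf assumptions
// in the downstream pipeline. Only FP binary operators and FP intrinsics are
// touched above.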
struct AllocInfo {
std::vector<std::string> allocators;
std::string realloc;
@ -751,6 +783,136 @@ struct AllocationHoister : public llvm::PassInfoMixin<AllocationHoister> {
}
};
struct AllocationAutoFree : public llvm::PassInfoMixin<AllocationAutoFree> {
AllocInfo info;
explicit AllocationAutoFree(
std::vector<std::string> allocators = {"seq_alloc", "seq_alloc_atomic",
"seq_alloc_uncollectable",
"seq_alloc_atomic_uncollectable"},
const std::string &realloc = "seq_realloc", const std::string &free = "seq_free")
: info(std::move(allocators), realloc, free) {}
llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &FAM) {
// Get the necessary analysis results.
auto &MSSA = FAM.getResult<llvm::MemorySSAAnalysis>(F);
auto &TLI = FAM.getResult<llvm::TargetLibraryAnalysis>(F);
auto &AA = FAM.getResult<llvm::AAManager>(F);
auto &DT = FAM.getResult<llvm::DominatorTreeAnalysis>(F);
auto &PDT = FAM.getResult<llvm::PostDominatorTreeAnalysis>(F);
auto &LI = FAM.getResult<llvm::LoopAnalysis>(F);
auto &CI = FAM.getResult<llvm::CycleAnalysis>(F);
bool Changed = false;
// Traverse the function to find allocs and insert corresponding frees.
for (auto &BB : F) {
for (auto &I : BB) {
if (auto *Alloc = llvm::dyn_cast<llvm::CallInst>(&I)) {
auto *Callee = Alloc->getCalledFunction();
if (!Callee || !Callee->isDeclaration())
continue;
if (info.isAlloc(Alloc)) {
if (llvm::PointerMayBeCaptured(Alloc, /*ReturnCaptures=*/true,
/*StoreCaptures=*/true))
continue;
Changed |= insertFree(Alloc, F, DT, PDT, LI, CI);
}
}
}
}
return (Changed ? llvm::PreservedAnalyses::none() : llvm::PreservedAnalyses::all());
}
bool insertFree(llvm::Instruction *Alloc, llvm::Function &F, llvm::DominatorTree &DT,
llvm::PostDominatorTree &PDT, llvm::LoopInfo &LI,
llvm::CycleInfo &CI) {
llvm::SmallVector<llvm::Value *, 8> Worklist;
llvm::SmallPtrSet<llvm::Value *, 8> Visited;
llvm::SmallVector<llvm::BasicBlock *, 8> UseBlocks;
// We need to find a basic block that:
// 1. Post-dominates the allocation block (so we always free it)
// 2. Is dominated by the allocation block (so the use is valid)
// 3. Post-dominates all uses
// Start with the original pointer.
Worklist.push_back(Alloc);
UseBlocks.push_back(Alloc->getParent());
// Track all blocks where the pointer or its derived values are used.
while (!Worklist.empty()) {
auto *CurrentPtr = Worklist.pop_back_val();
if (!Visited.insert(CurrentPtr).second)
continue;
// Traverse all users of the current pointer.
for (auto *U : CurrentPtr->users()) {
if (auto *Inst = llvm::dyn_cast<llvm::Instruction>(U)) {
if (auto *call = llvm::dyn_cast<llvm::CallBase>(Inst))
if (call->getCalledFunction() && info.isFree(call->getCalledFunction()))
return false;
if (llvm::isa<llvm::GetElementPtrInst>(Inst) ||
llvm::isa<llvm::BitCastInst>(Inst) || llvm::isa<llvm::PHINode>(Inst) ||
llvm::isa<llvm::SelectInst>(Inst)) {
Worklist.push_back(Inst);
} else {
// If this is a real use, record the block.
UseBlocks.push_back(Inst->getParent());
}
}
}
}
// Find the closest post-dominating block of all the use blocks.
llvm::BasicBlock *PostDomBlock = nullptr;
for (auto *BB : UseBlocks) {
if (!PostDomBlock) {
PostDomBlock = BB;
} else {
PostDomBlock = PDT.findNearestCommonDominator(PostDomBlock, BB);
if (!PostDomBlock) {
return false;
}
}
}
auto *allocLoop = LI.getLoopFor(Alloc->getParent());
auto *freeLoop = LI.getLoopFor(PostDomBlock);
while (allocLoop != freeLoop) {
if (!freeLoop)
return false;
PostDomBlock = freeLoop->getExitBlock();
if (!PostDomBlock)
return false;
freeLoop = LI.getLoopFor(PostDomBlock);
}
if (!DT.dominates(Alloc->getParent(), PostDomBlock)) {
return false;
}
llvm::IRBuilder<> B(PostDomBlock->getTerminator());
auto *FreeFunc = F.getParent()->getFunction(info.free);
if (!FreeFunc) {
FreeFunc = llvm::Function::Create(
llvm::FunctionType::get(B.getVoidTy(), {B.getPtrTy()}, false),
llvm::Function::ExternalLinkage, info.free, F.getParent());
FreeFunc->setWillReturn();
FreeFunc->setDoesNotThrow();
}
// Add free
B.CreateCall(FreeFunc, Alloc);
return true;
}
};
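// Placement sketch for the auto-free logic above (illustrative, not generated
// output): given
//   %p = call ptr @seq_alloc(i64 %n)          ; block A
//   ... loads/GEPs/phis derived from %p ...   ; blocks dominated by A
// a call to @seq_free(%p) is inserted at the terminator of the nearest block
// that post-dominates A and every use, provided A dominates that block, the
// pointer never escapes, and the free ends up in the same loop as the alloc.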
/// Sometimes coroutine lowering produces hard-to-analyze loops involving
/// function pointer comparisons. This pass puts them into a somewhat
/// easier-to-analyze form.
@ -826,9 +988,15 @@ struct CoroBranchSimplifier : public llvm::PassInfoMixin<CoroBranchSimplifier> {
}
};
llvm::cl::opt<bool>
DisableNative("disable-native",
llvm::cl::desc("Disable architecture-specific optimizations"),
llvm::cl::init(false));
void runLLVMOptimizationPasses(llvm::Module *module, bool debug, bool jit,
PluginManager *plugins) {
applyDebugTransformations(module, debug, jit);
applyFastMathTransformations(module);
llvm::LoopAnalysisManager lam;
llvm::FunctionAnalysisManager fam;
@ -860,9 +1028,14 @@ void runLLVMOptimizationPasses(llvm::Module *module, bool debug, bool jit,
pm.addPass(llvm::LoopSimplifyPass());
pm.addPass(llvm::LCSSAPass());
pm.addPass(AllocationHoister());
if (AutoFree)
pm.addPass(AllocationAutoFree());
}
});
if (!DisableNative)
addNativeLLVMPasses(&pb);
if (plugins) {
for (auto *plugin : *plugins) {
plugin->dsl->addLLVMPasses(&pb, debug);
@ -884,7 +1057,15 @@ void runLLVMOptimizationPasses(llvm::Module *module, bool debug, bool jit,
void verify(llvm::Module *module) {
const bool broken = llvm::verifyModule(*module, &llvm::errs());
seqassertn(!broken, "module broken");
if (broken) {
auto fo = fopen("_dump.ll", "w");
llvm::raw_fd_ostream fout(fileno(fo), true);
fout << *module;
fout.close();
}
seqassertn(!broken, "Generated LLVM IR is invalid and has been dumped to '_dump.ll'. "
"Please submit a bug report at https://github.com/exaloop/codon "
"including the code and generated LLVM IR.");
}
} // namespace
@ -906,5 +1087,7 @@ void optimize(llvm::Module *module, bool debug, bool jit, PluginManager *plugins
verify(module);
}
bool isFastMathOn() { return FastMath; }
} // namespace ir
} // namespace codon

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once
@ -20,5 +20,7 @@ getTargetMachine(llvm::Module *module, bool setFunctionAttributes = false,
void optimize(llvm::Module *module, bool debug, bool jit = false,
PluginManager *plugins = nullptr);
bool isFastMathOn();
} // namespace ir
} // namespace codon

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "module.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "canonical.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "dead_code.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "global_demote.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "replacer.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "const_fold.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "const_prop.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "folding.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "imperative.h"
@ -117,7 +117,7 @@ void ImperativeForFlowLowering::handle(ForFlow *v) {
// body
auto *parent = cast<BodiedFunc>(getParentFunc());
auto *series = M->N<SeriesFlow>(v->getSrcInfo());
auto *listVar = util::makeVar(list, series, parent)->getVar();
auto *listVar = util::makeVar(list, series, parent);
auto *lenVal = M->Nr<ExtractInstr>(M->Nr<VarValue>(listVar), "len");
auto *lenVar = util::makeVar(lenVal, series, parent);
auto *ptrVal = M->Nr<ExtractInstr>(
@ -129,12 +129,14 @@ void ImperativeForFlowLowering::handle(ForFlow *v) {
auto *oldLoopVar = v->getVar();
auto *newLoopVar = M->Nr<Var>(M->getIntType());
parent->push_back(newLoopVar);
auto *replacement = M->N<ImperativeForFlow>(
v->getSrcInfo(), M->getInt(0), 1, lenVar, body, newLoopVar, std::move(sched));
auto *replacement = M->N<ImperativeForFlow>(v->getSrcInfo(), M->getInt(0), 1,
M->Nr<VarValue>(lenVar), body,
newLoopVar, std::move(sched));
series->push_back(replacement);
body->insert(
body->begin(),
M->Nr<AssignInstr>(oldLoopVar, (*ptrVar)[*M->Nr<VarValue>(newLoopVar)]));
M->Nr<AssignInstr>(oldLoopVar,
(*M->Nr<VarValue>(ptrVar))[*M->Nr<VarValue>(newLoopVar)]));
v->replaceAll(series);
}
}

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "pipeline.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "manager.h"
@ -15,6 +15,7 @@
#include "codon/cir/transform/lowering/imperative.h"
#include "codon/cir/transform/lowering/pipeline.h"
#include "codon/cir/transform/manager.h"
#include "codon/cir/transform/numpy/numpy.h"
#include "codon/cir/transform/parallel/openmp.h"
#include "codon/cir/transform/pass.h"
#include "codon/cir/transform/pythonic/dict.h"
@ -196,6 +197,9 @@ void PassManager::registerStandardPasses(PassManager::Init init) {
pyNumerics),
/*insertBefore=*/"", {seKey1, rdKey, globalKey},
{seKey1, rdKey, cfgKey, globalKey, capKey});
registerPass(std::make_unique<numpy::NumPyFusionPass>(rdKey, seKey2),
/*insertBefore=*/"", {rdKey, seKey2},
{seKey1, rdKey, cfgKey, globalKey, capKey});
// parallel
registerPass(std::make_unique<parallel::OpenMPPass>(), /*insertBefore=*/"", {},

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -0,0 +1,982 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "numpy.h"
#include "codon/cir/util/irtools.h"
namespace codon {
namespace ir {
namespace transform {
namespace numpy {
namespace {
types::Type *coerceScalarArray(NumPyType &scalar, NumPyType &array,
NumPyPrimitiveTypes &T) {
auto xtype = scalar.dtype;
auto atype = array.dtype;
bool aIsInt = false;
bool xIsInt = false;
bool aIsFloat = false;
bool xIsFloat = false;
bool aIsComplex = false;
bool xIsComplex = false;
switch (atype) {
case NumPyType::NP_TYPE_ARR_BOOL:
break;
case NumPyType::NP_TYPE_ARR_I8:
case NumPyType::NP_TYPE_ARR_U8:
case NumPyType::NP_TYPE_ARR_I16:
case NumPyType::NP_TYPE_ARR_U16:
case NumPyType::NP_TYPE_ARR_I32:
case NumPyType::NP_TYPE_ARR_U32:
case NumPyType::NP_TYPE_ARR_I64:
case NumPyType::NP_TYPE_ARR_U64:
aIsInt = true;
break;
case NumPyType::NP_TYPE_ARR_F16:
case NumPyType::NP_TYPE_ARR_F32:
case NumPyType::NP_TYPE_ARR_F64:
aIsFloat = true;
break;
case NumPyType::NP_TYPE_ARR_C64:
case NumPyType::NP_TYPE_ARR_C128:
aIsComplex = true;
break;
default:
seqassertn(false, "unexpected type");
}
xIsInt = (xtype == NumPyType::NP_TYPE_BOOL || xtype == NumPyType::NP_TYPE_I64);
xIsFloat = (xtype == NumPyType::NP_TYPE_F64);
xIsComplex = (xtype == NumPyType::NP_TYPE_C128);
bool shouldCast =
((xIsInt && (aIsInt || aIsFloat || aIsComplex)) ||
(xIsFloat && (aIsFloat || aIsComplex)) || (xIsComplex && aIsComplex));
if ((atype == NumPyType::NP_TYPE_ARR_F16 || atype == NumPyType::NP_TYPE_ARR_F32) &&
xtype == NumPyType::NP_TYPE_C128)
return T.c64;
else if (shouldCast)
return array.getIRBaseType(T);
else
return scalar.getIRBaseType(T);
}
template <typename E>
types::Type *decideTypes(E *expr, NumPyType &lhs, NumPyType &rhs,
NumPyPrimitiveTypes &T) {
// Special case(s)
if (expr->op == E::NP_OP_COPYSIGN)
return expr->type.getIRBaseType(T);
if (lhs.isArray() && !rhs.isArray())
return coerceScalarArray(rhs, lhs, T);
if (!lhs.isArray() && rhs.isArray())
return coerceScalarArray(lhs, rhs, T);
auto *t1 = lhs.getIRBaseType(T);
auto *t2 = rhs.getIRBaseType(T);
auto *M = t1->getModule();
auto *coerceFunc = M->getOrRealizeFunc("_coerce", {}, {t1, t2}, FUSION_MODULE);
seqassertn(coerceFunc, "coerce func not found");
return util::getReturnType(coerceFunc);
}
} // namespace
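// Worked examples of the scalar/array coercion above (derived from the rules
// in coerceScalarArray; dtype spellings are illustrative):
//   int scalar        op float32 array  -> float32   (scalar follows the array)
//   float scalar      op int64 array    -> float64   (scalar's type wins)
//   complex128 scalar op float32 array  -> complex64 (explicit special case)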
void NumPyExpr::replace(NumPyExpr &e) {
type = e.type;
val = e.val;
op = e.op;
lhs = std::move(e.lhs);
rhs = std::move(e.rhs);
freeable = e.freeable;
e.type = {};
e.val = nullptr;
e.op = NP_OP_NONE;
e.lhs = {};
e.rhs = {};
e.freeable = false;
}
bool NumPyExpr::haveVectorizedLoop() const {
if (lhs && !(lhs->type.dtype == NumPyType::NP_TYPE_ARR_F32 ||
lhs->type.dtype == NumPyType::NP_TYPE_ARR_F64))
return false;
if (rhs && !(rhs->type.dtype == NumPyType::NP_TYPE_ARR_F32 ||
rhs->type.dtype == NumPyType::NP_TYPE_ARR_F64))
return false;
if (lhs && rhs && lhs->type.dtype != rhs->type.dtype)
return false;
// These are the loops available in the runtime library.
static const std::vector<std::string> VecLoops = {
"arccos", "arccosh", "arcsin", "arcsinh", "arctan", "arctanh", "arctan2",
"cos", "exp", "exp2", "expm1", "log", "log10", "log1p",
"log2", "sin", "sinh", "tanh", "hypot"};
return std::find(VecLoops.begin(), VecLoops.end(), opstring()) != VecLoops.end();
}
int64_t NumPyExpr::opcost() const {
switch (op) {
case NP_OP_NONE:
return 0;
case NP_OP_POS:
return 0;
case NP_OP_NEG:
return 0;
case NP_OP_INVERT:
return 0;
case NP_OP_ABS:
return 1;
case NP_OP_TRANSPOSE:
return 0;
case NP_OP_ADD:
return 1;
case NP_OP_SUB:
return 1;
case NP_OP_MUL:
return 1;
case NP_OP_MATMUL:
return 20;
case NP_OP_TRUE_DIV:
return 8;
case NP_OP_FLOOR_DIV:
return 8;
case NP_OP_MOD:
return 8;
case NP_OP_FMOD:
return 8;
case NP_OP_POW:
return 8;
case NP_OP_LSHIFT:
return 1;
case NP_OP_RSHIFT:
return 1;
case NP_OP_AND:
return 1;
case NP_OP_OR:
return 1;
case NP_OP_XOR:
return 1;
case NP_OP_LOGICAL_AND:
return 1;
case NP_OP_LOGICAL_OR:
return 1;
case NP_OP_LOGICAL_XOR:
return 1;
case NP_OP_EQ:
return 1;
case NP_OP_NE:
return 1;
case NP_OP_LT:
return 1;
case NP_OP_LE:
return 1;
case NP_OP_GT:
return 1;
case NP_OP_GE:
return 1;
case NP_OP_MIN:
return 3;
case NP_OP_MAX:
return 3;
case NP_OP_FMIN:
return 3;
case NP_OP_FMAX:
return 3;
case NP_OP_SIN:
return 10;
case NP_OP_COS:
return 10;
case NP_OP_TAN:
return 10;
case NP_OP_ARCSIN:
return 20;
case NP_OP_ARCCOS:
return 20;
case NP_OP_ARCTAN:
return 20;
case NP_OP_ARCTAN2:
return 35;
case NP_OP_HYPOT:
return 5;
case NP_OP_SINH:
return 10;
case NP_OP_COSH:
return 10;
case NP_OP_TANH:
return 10;
case NP_OP_ARCSINH:
return 10;
case NP_OP_ARCCOSH:
return 10;
case NP_OP_ARCTANH:
return 10;
case NP_OP_CONJ:
return 1;
case NP_OP_EXP:
return 5;
case NP_OP_EXP2:
return 5;
case NP_OP_LOG:
return 5;
case NP_OP_LOG2:
return 5;
case NP_OP_LOG10:
return 5;
case NP_OP_EXPM1:
return 5;
case NP_OP_LOG1P:
return 5;
case NP_OP_SQRT:
return 2;
case NP_OP_SQUARE:
return 1;
case NP_OP_CBRT:
return 5;
case NP_OP_LOGADDEXP:
return 10;
case NP_OP_LOGADDEXP2:
return 10;
case NP_OP_RECIPROCAL:
return 1;
case NP_OP_RINT:
return 1;
case NP_OP_FLOOR:
return 1;
case NP_OP_CEIL:
return 1;
case NP_OP_TRUNC:
return 1;
case NP_OP_ISNAN:
return 1;
case NP_OP_ISINF:
return 1;
case NP_OP_ISFINITE:
return 1;
case NP_OP_SIGN:
return 1;
case NP_OP_SIGNBIT:
return 1;
case NP_OP_COPYSIGN:
return 1;
case NP_OP_SPACING:
return 1;
case NP_OP_NEXTAFTER:
return 1;
case NP_OP_DEG2RAD:
return 2;
case NP_OP_RAD2DEG:
return 2;
case NP_OP_HEAVISIDE:
return 3;
}
return -1; // not reached for the ops above; cost() interprets -1 as "unknown"
}
int64_t NumPyExpr::cost() const {
auto c = opcost();
if (c == -1)
return -1;
// Account for the fact that the vectorized loops are much faster.
if (haveVectorizedLoop()) {
c *= 3;
if (lhs->type.dtype == NumPyType::NP_TYPE_ARR_F32)
c *= 2;
}
bool lhsIntConst = (lhs && lhs->isLeaf() && isA<IntConst>(lhs->val));
bool rhsIntConst = (rhs && rhs->isLeaf() && isA<IntConst>(rhs->val));
bool lhsFloatConst = (lhs && lhs->isLeaf() && isA<FloatConst>(lhs->val));
bool rhsFloatConst = (rhs && rhs->isLeaf() && isA<FloatConst>(rhs->val));
bool lhsConst = lhsIntConst || lhsFloatConst;
bool rhsConst = rhsIntConst || rhsFloatConst;
if (rhsConst || lhsConst) {
switch (op) {
case NP_OP_TRUE_DIV:
case NP_OP_FLOOR_DIV:
case NP_OP_MOD:
case NP_OP_FMOD:
c = 1;
break;
case NP_OP_POW:
if (rhsIntConst)
c = (cast<IntConst>(rhs->val)->getVal() == 2) ? 1 : 5;
break;
default:
break;
}
}
if (lhs) {
auto cl = lhs->cost();
if (cl == -1)
return -1;
c += cl;
}
if (rhs) {
auto cr = rhs->cost();
if (cr == -1)
return -1;
c += cr;
}
return c;
}
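// Worked example (illustrative): for a tree computing a * b + c over float64
// arrays, cost() = opcost(mul)=1 + opcost(add)=1 + 0 per leaf = 2; neither op
// has a vectorized runtime loop, so no scaling applies. A division with a
// constant operand drops to cost 1, and x ** 2 with a constant integer
// exponent is likewise treated as cost 1.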
std::string NumPyExpr::opstring() const {
static const std::unordered_map<Op, std::string> m = {
{NP_OP_NONE, "a"},
{NP_OP_POS, "pos"},
{NP_OP_NEG, "neg"},
{NP_OP_INVERT, "invert"},
{NP_OP_ABS, "abs"},
{NP_OP_TRANSPOSE, "transpose"},
{NP_OP_ADD, "add"},
{NP_OP_SUB, "sub"},
{NP_OP_MUL, "mul"},
{NP_OP_MATMUL, "matmul"},
{NP_OP_TRUE_DIV, "true_div"},
{NP_OP_FLOOR_DIV, "floor_div"},
{NP_OP_MOD, "mod"},
{NP_OP_FMOD, "fmod"},
{NP_OP_POW, "pow"},
{NP_OP_LSHIFT, "lshift"},
{NP_OP_RSHIFT, "rshift"},
{NP_OP_AND, "and"},
{NP_OP_OR, "or"},
{NP_OP_XOR, "xor"},
{NP_OP_LOGICAL_AND, "logical_and"},
{NP_OP_LOGICAL_OR, "logical_or"},
{NP_OP_LOGICAL_XOR, "logical_xor"},
{NP_OP_EQ, "eq"},
{NP_OP_NE, "ne"},
{NP_OP_LT, "lt"},
{NP_OP_LE, "le"},
{NP_OP_GT, "gt"},
{NP_OP_GE, "ge"},
{NP_OP_MIN, "minimum"},
{NP_OP_MAX, "maximum"},
{NP_OP_FMIN, "fmin"},
{NP_OP_FMAX, "fmax"},
{NP_OP_SIN, "sin"},
{NP_OP_COS, "cos"},
{NP_OP_TAN, "tan"},
{NP_OP_ARCSIN, "arcsin"},
{NP_OP_ARCCOS, "arccos"},
{NP_OP_ARCTAN, "arctan"},
{NP_OP_ARCTAN2, "arctan2"},
{NP_OP_HYPOT, "hypot"},
{NP_OP_SINH, "sinh"},
{NP_OP_COSH, "cosh"},
{NP_OP_TANH, "tanh"},
{NP_OP_ARCSINH, "arcsinh"},
{NP_OP_ARCCOSH, "arccosh"},
{NP_OP_ARCTANH, "arctanh"},
{NP_OP_CONJ, "conj"},
{NP_OP_EXP, "exp"},
{NP_OP_EXP2, "exp2"},
{NP_OP_LOG, "log"},
{NP_OP_LOG2, "log2"},
{NP_OP_LOG10, "log10"},
{NP_OP_EXPM1, "expm1"},
{NP_OP_LOG1P, "log1p"},
{NP_OP_SQRT, "sqrt"},
{NP_OP_SQUARE, "square"},
{NP_OP_CBRT, "cbrt"},
{NP_OP_LOGADDEXP, "logaddexp"},
{NP_OP_LOGADDEXP2, "logaddexp2"},
{NP_OP_RECIPROCAL, "reciprocal"},
{NP_OP_RINT, "rint"},
{NP_OP_FLOOR, "floor"},
{NP_OP_CEIL, "ceil"},
{NP_OP_TRUNC, "trunc"},
{NP_OP_ISNAN, "isnan"},
{NP_OP_ISINF, "isinf"},
{NP_OP_ISFINITE, "isfinite"},
{NP_OP_SIGN, "sign"},
{NP_OP_SIGNBIT, "signbit"},
{NP_OP_COPYSIGN, "copysign"},
{NP_OP_SPACING, "spacing"},
{NP_OP_NEXTAFTER, "nextafter"},
{NP_OP_DEG2RAD, "deg2rad"},
{NP_OP_RAD2DEG, "rad2deg"},
{NP_OP_HEAVISIDE, "heaviside"},
};
auto it = m.find(op);
seqassertn(it != m.end(), "op not found");
return it->second;
}
void NumPyExpr::dump(std::ostream &os, int level, int &leafId) const {
auto indent = [&]() {
for (int i = 0; i < level; i++)
os << " ";
};
indent();
if (op == NP_OP_NONE) {
os << "\033[1;36m" << opstring() << leafId;
++leafId;
} else {
os << "\033[1;33m" << opstring();
}
os << "\033[0m <" << type << ">";
if (op != NP_OP_NONE)
os << " \033[1;35m[cost=" << cost() << "]\033[0m";
os << "\n";
if (lhs)
lhs->dump(os, level + 1, leafId);
if (rhs)
rhs->dump(os, level + 1, leafId);
}
std::ostream &operator<<(std::ostream &os, NumPyExpr const &expr) {
int leafId = 0;
expr.dump(os, 0, leafId);
return os;
}
std::string NumPyExpr::str() const {
std::stringstream buffer;
buffer << *this;
return buffer.str();
}
void NumPyExpr::apply(std::function<void(NumPyExpr &)> f) {
f(*this);
if (lhs)
lhs->apply(f);
if (rhs)
rhs->apply(f);
}
Value *NumPyExpr::codegenBroadcasts(CodegenContext &C) {
auto *M = C.M;
auto &vars = C.vars;
Value *targetShape = nullptr;
Value *result = nullptr;
apply([&](NumPyExpr &e) {
if (e.isLeaf() && e.type.isArray()) {
auto it = vars.find(&e);
seqassertn(it != vars.end(),
"NumPyExpr not found in vars map (codegen broadcasts)");
auto *var = it->second;
auto *shape = M->getOrRealizeFunc("_shape", {var->getType()}, {}, FUSION_MODULE);
seqassertn(shape, "shape function not found");
auto *leafShape = util::call(shape, {M->Nr<VarValue>(var)});
if (!targetShape) {
targetShape = leafShape;
} else {
auto *diff = (*targetShape != *leafShape);
if (result) {
result = *result | *diff;
} else {
result = diff;
}
}
}
});
return result ? result : M->getBool(false);
}
Var *NumPyExpr::codegenFusedEval(CodegenContext &C) {
auto *M = C.M;
auto *series = C.series;
auto *func = C.func;
auto &vars = C.vars;
auto &T = C.T;
std::vector<std::pair<NumPyExpr *, Var *>> leaves;
apply([&](NumPyExpr &e) {
if (e.isLeaf()) {
auto it = vars.find(&e);
seqassertn(it != vars.end(), "NumPyExpr not found in vars map (fused eval)");
auto *var = it->second;
leaves.emplace_back(&e, var);
}
});
// Arrays for scalar expression function
std::vector<Value *> arrays;
std::vector<std::string> scalarFuncArgNames;
std::vector<types::Type *> scalarFuncArgTypes;
std::unordered_map<NumPyExpr *, Var *> scalarFuncArgMap;
// Scalars passed through 'extra' arg of ndarray._loop()
std::vector<Value *> extra;
std::unordered_map<NumPyExpr *, unsigned> extraMap;
auto *baseType = type.getIRBaseType(T);
scalarFuncArgNames.push_back("out");
scalarFuncArgTypes.push_back(M->getPointerType(baseType));
unsigned argIdx = 0;
unsigned extraIdx = 0;
for (auto &e : leaves) {
if (e.first->type.isArray()) {
arrays.push_back(M->Nr<VarValue>(e.second));
scalarFuncArgNames.push_back("in" + std::to_string(argIdx++));
scalarFuncArgTypes.push_back(M->getPointerType(e.first->type.getIRBaseType(T)));
} else {
extra.push_back(M->Nr<VarValue>(e.second));
extraMap.emplace(e.first, extraIdx++);
}
}
auto *extraTuple = util::makeTuple(extra, M);
scalarFuncArgNames.push_back("extra");
scalarFuncArgTypes.push_back(extraTuple->getType());
auto *scalarFuncType = M->getFuncType(M->getNoneType(), scalarFuncArgTypes);
auto *scalarFunc = M->Nr<BodiedFunc>("__numpy_fusion_scalar_fn");
scalarFunc->realize(scalarFuncType, scalarFuncArgNames);
std::vector<Var *> scalarFuncArgVars(scalarFunc->arg_begin(), scalarFunc->arg_end());
argIdx = 1;
for (auto &e : leaves) {
if (e.first->type.isArray()) {
scalarFuncArgMap.emplace(e.first, scalarFuncArgVars[argIdx++]);
}
}
auto *scalarExpr =
codegenScalarExpr(C, scalarFuncArgMap, extraMap, scalarFuncArgVars.back());
auto *ptrsetFunc = M->getOrRealizeFunc("_ptrset", {scalarFuncArgTypes[0], baseType},
{}, FUSION_MODULE);
seqassertn(ptrsetFunc, "ptrset func not found");
scalarFunc->setBody(util::series(
util::call(ptrsetFunc, {M->Nr<VarValue>(scalarFuncArgVars[0]), scalarExpr})));
auto *arraysTuple = util::makeTuple(arrays);
auto *loopFunc = M->getOrRealizeFunc(
"_loop_alloc",
{arraysTuple->getType(), scalarFunc->getType(), extraTuple->getType()},
{baseType}, FUSION_MODULE);
seqassertn(loopFunc, "loop_alloc func not found");
auto *result = util::makeVar(
util::call(loopFunc, {arraysTuple, M->Nr<VarValue>(scalarFunc), extraTuple}),
series, func);
// Free temporary arrays
apply([&](NumPyExpr &e) {
if (e.isLeaf() && e.freeable) {
auto it = vars.find(&e);
seqassertn(it != vars.end(), "NumPyExpr not found in vars map (fused eval)");
auto *var = it->second;
auto *freeFunc =
M->getOrRealizeFunc("_free", {var->getType()}, {}, FUSION_MODULE);
seqassertn(freeFunc, "free func not found");
series->push_back(util::call(freeFunc, {M->Nr<VarValue>(var)}));
}
});
return result;
}
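// Rough shape of the kernel generated above (hedged Codon-level pseudocode;
// the real function is assembled directly in IR):
//   def __numpy_fusion_scalar_fn(out: Ptr[T], in0: Ptr[T0], ..., extra):
//       out[0] = <scalar expression over in0[0], in1[0], ... and extra>
// _loop_alloc(arrays, __numpy_fusion_scalar_fn, extra) then allocates the
// output and drives this kernel across the broadcast, strided iteration
// space; leaves marked 'freeable' are released via _free afterwards.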
Var *NumPyExpr::codegenSequentialEval(CodegenContext &C) {
auto *M = C.M;
auto *series = C.series;
auto *func = C.func;
auto &vars = C.vars;
auto &T = C.T;
if (isLeaf()) {
auto it = vars.find(this);
seqassertn(it != vars.end(),
"NumPyExpr not found in vars map (codegen sequential eval)");
return it->second;
}
Var *lv = lhs->codegenSequentialEval(C);
Var *rv = rhs ? rhs->codegenSequentialEval(C) : nullptr;
Var *like = nullptr;
Value *outShapeVal = nullptr;
if (rv) {
// Can't do anything special with matmul here...
if (op == NP_OP_MATMUL) {
auto *matmul = M->getOrRealizeFunc("_matmul", {lv->getType(), rv->getType()}, {},
FUSION_MODULE);
return util::makeVar(
util::call(matmul, {M->Nr<VarValue>(lv), M->Nr<VarValue>(rv)}), series, func);
}
auto *lshape = M->getOrRealizeFunc("_shape", {lv->getType()}, {}, FUSION_MODULE);
seqassertn(lshape, "shape func not found for left arg");
auto *rshape = M->getOrRealizeFunc("_shape", {rv->getType()}, {}, FUSION_MODULE);
seqassertn(rshape, "shape func not found for right arg");
auto *leftShape = util::call(lshape, {M->Nr<VarValue>(lv)});
auto *rightShape = util::call(rshape, {M->Nr<VarValue>(rv)});
auto *shape = M->getOrRealizeFunc(
"_broadcast", {leftShape->getType(), rightShape->getType()}, {}, FUSION_MODULE);
seqassertn(shape, "output shape func not found");
like = rhs->type.ndim > lhs->type.ndim ? rv : lv;
outShapeVal = util::call(shape, {leftShape, rightShape});
} else {
auto *shape = M->getOrRealizeFunc("_shape", {lv->getType()}, {}, FUSION_MODULE);
seqassertn(shape, "shape func not found");
like = lv;
outShapeVal = util::call(shape, {M->Nr<VarValue>(lv)});
}
auto *outShape = util::makeVar(outShapeVal, series, func);
Var *result = nullptr;
bool lfreeable = lhs && lhs->type.isArray() && (lhs->freeable || !lhs->isLeaf());
bool rfreeable = rhs && rhs->type.isArray() && (rhs->freeable || !rhs->isLeaf());
bool ltmp = lfreeable && lhs->type.dtype == type.dtype && lhs->type.ndim == type.ndim;
bool rtmp = rfreeable && rhs->type.dtype == type.dtype && rhs->type.ndim == type.ndim;
auto *t = type.getIRBaseType(T);
auto newArray = [&]() {
auto *create = M->getOrRealizeFunc(
"_create", {like->getType(), outShape->getType()}, {t}, FUSION_MODULE);
seqassertn(create, "create func not found");
return util::call(create, {M->Nr<VarValue>(like), M->Nr<VarValue>(outShape)});
};
bool freeLeftStatic = false;
bool freeRightStatic = false;
Var *lcond = nullptr;
Var *rcond = nullptr;
if (rv) {
if (ltmp && rhs->type.ndim == 0) {
// We are adding lhs temp array to const or 0-dim array, so reuse lhs array.
result = lv;
} else if (rtmp && lhs->type.ndim == 0) {
// We are adding rhs temp array to const or 0-dim array, so reuse rhs array.
result = rv;
} else if (!ltmp && !rtmp) {
// Neither operand is a temp array, so we must allocate a new array.
result = util::makeVar(newArray(), series, func);
freeLeftStatic = lfreeable;
freeRightStatic = rfreeable;
} else if (ltmp && rtmp) {
// We won't know until runtime if we can reuse the temp array(s) since they
// might broadcast.
auto *lshape = M->getOrRealizeFunc("_shape", {lv->getType()}, {}, FUSION_MODULE);
seqassertn(lshape, "shape function func not found for left arg");
auto *rshape = M->getOrRealizeFunc("_shape", {rv->getType()}, {}, FUSION_MODULE);
seqassertn(rshape, "shape function func not found for right arg");
auto *leftShape = util::call(lshape, {M->Nr<VarValue>(lv)});
auto *rightShape = util::call(rshape, {M->Nr<VarValue>(rv)});
lcond = util::makeVar(*leftShape == *M->Nr<VarValue>(outShape), series, func);
rcond = util::makeVar(*rightShape == *M->Nr<VarValue>(outShape), series, func);
auto *arr = M->Nr<TernaryInstr>(
M->Nr<VarValue>(lcond), M->Nr<VarValue>(lv),
M->Nr<TernaryInstr>(M->Nr<VarValue>(rcond), M->Nr<VarValue>(rv), newArray()));
result = util::makeVar(arr, series, func);
} else if (ltmp && !rtmp) {
// We won't know until runtime if we can reuse the temp array(s) since they
// might broadcast.
auto *lshape = M->getOrRealizeFunc("_shape", {lv->getType()}, {}, FUSION_MODULE);
seqassertn(lshape, "shape function func not found for left arg");
auto *leftShape = util::call(lshape, {M->Nr<VarValue>(lv)});
lcond = util::makeVar(*leftShape == *M->Nr<VarValue>(outShape), series, func);
auto *arr =
M->Nr<TernaryInstr>(M->Nr<VarValue>(lcond), M->Nr<VarValue>(lv), newArray());
result = util::makeVar(arr, series, func);
freeRightStatic = rfreeable;
} else if (!ltmp && rtmp) {
// We won't know until runtime if we can reuse the temp array(s) since they
// might broadcast.
auto *rshape = M->getOrRealizeFunc("_shape", {rv->getType()}, {}, FUSION_MODULE);
seqassertn(rshape, "shape function func not found for right arg");
auto *rightShape = util::call(rshape, {M->Nr<VarValue>(rv)});
rcond = util::makeVar(*rightShape == *M->Nr<VarValue>(outShape), series, func);
auto *arr =
M->Nr<TernaryInstr>(M->Nr<VarValue>(rcond), M->Nr<VarValue>(rv), newArray());
result = util::makeVar(arr, series, func);
freeLeftStatic = lfreeable;
}
} else {
if (ltmp) {
result = lv;
} else {
result = util::makeVar(newArray(), series, func);
freeLeftStatic = lfreeable;
}
}
auto opstr = opstring();
if (haveVectorizedLoop()) {
// We have a vectorized loop available for this operation.
if (rv) {
auto *vecloop = M->getOrRealizeFunc(
"_apply_vectorized_loop_binary",
{lv->getType(), rv->getType(), result->getType()}, {opstr}, FUSION_MODULE);
seqassertn(vecloop, "binary vec loop func not found ({})", opstr);
series->push_back(util::call(vecloop, {M->Nr<VarValue>(lv), M->Nr<VarValue>(rv),
M->Nr<VarValue>(result)}));
} else {
auto *vecloop = M->getOrRealizeFunc("_apply_vectorized_loop_unary",
{lv->getType(), result->getType()}, {opstr},
FUSION_MODULE);
seqassertn(vecloop, "unary vec loop func not found ({})", opstr);
series->push_back(
util::call(vecloop, {M->Nr<VarValue>(lv), M->Nr<VarValue>(result)}));
}
} else {
// Arrays for scalar expression function
std::vector<Value *> arrays = {M->Nr<VarValue>(result)};
std::vector<std::string> scalarFuncArgNames;
std::vector<types::Type *> scalarFuncArgTypes;
std::unordered_map<NumPyExpr *, Var *> scalarFuncArgMap;
// Scalars passed through 'extra' arg of ndarray._loop()
std::vector<Value *> extra;
auto *baseType = type.getIRBaseType(T);
scalarFuncArgNames.push_back("out");
scalarFuncArgTypes.push_back(M->getPointerType(baseType));
if (lhs->type.isArray()) {
if (result != lv) {
scalarFuncArgNames.push_back("in0");
scalarFuncArgTypes.push_back(M->getPointerType(lhs->type.getIRBaseType(T)));
arrays.push_back(M->Nr<VarValue>(lv));
}
} else {
extra.push_back(M->Nr<VarValue>(lv));
}
if (rv) {
if (rhs->type.isArray()) {
if (result != rv) {
scalarFuncArgNames.push_back("in1");
scalarFuncArgTypes.push_back(M->getPointerType(rhs->type.getIRBaseType(T)));
arrays.push_back(M->Nr<VarValue>(rv));
}
} else {
extra.push_back(M->Nr<VarValue>(rv));
}
}
auto *extraTuple = util::makeTuple(extra, M);
scalarFuncArgNames.push_back("extra");
scalarFuncArgTypes.push_back(extraTuple->getType());
auto *scalarFuncType = M->getFuncType(M->getNoneType(), scalarFuncArgTypes);
auto *scalarFunc = M->Nr<BodiedFunc>("__numpy_fusion_scalar_fn");
scalarFunc->realize(scalarFuncType, scalarFuncArgNames);
std::vector<Var *> scalarFuncArgVars(scalarFunc->arg_begin(),
scalarFunc->arg_end());
auto *body = M->Nr<SeriesFlow>();
auto name = "_" + opstr;
auto deref = [&](unsigned idx) {
return (*M->Nr<VarValue>(scalarFuncArgVars[idx]))[*M->getInt(0)];
};
if (rv) {
Value *litem = nullptr;
Value *ritem = nullptr;
if (lhs->type.isArray() && rhs->type.isArray()) {
if (result == lv) {
litem = deref(0);
ritem = deref(1);
} else if (result == rv) {
litem = deref(1);
ritem = deref(0);
} else {
litem = deref(1);
ritem = deref(2);
}
} else if (lhs->type.isArray()) {
if (result == lv) {
litem = deref(0);
} else {
litem = deref(1);
}
ritem = util::tupleGet(M->Nr<VarValue>(scalarFuncArgVars.back()), 0);
} else if (rhs->type.isArray()) {
if (result == rv) {
ritem = deref(0);
} else {
ritem = deref(1);
}
litem = util::tupleGet(M->Nr<VarValue>(scalarFuncArgVars.back()), 0);
} else {
seqassertn(false, "both lhs are rhs are scalars");
}
auto *commonType = decideTypes(this, lhs->type, rhs->type, T);
auto *lcast =
M->getOrRealizeFunc("_cast", {litem->getType()}, {commonType}, FUSION_MODULE);
seqassertn(lcast, "cast func not found for left arg");
litem = util::call(lcast, {litem});
auto *rcast =
M->getOrRealizeFunc("_cast", {ritem->getType()}, {commonType}, FUSION_MODULE);
seqassertn(rcast, "cast func not found for left arg");
ritem = util::call(rcast, {ritem});
auto *op = M->getOrRealizeFunc(name, {litem->getType(), ritem->getType()}, {},
FUSION_MODULE);
seqassertn(op, "2-op func '{}' not found", name);
auto *oitem = util::call(op, {litem, ritem});
auto *ptrsetFunc = M->getOrRealizeFunc(
"_ptrset", {scalarFuncArgTypes[0], oitem->getType()}, {}, FUSION_MODULE);
seqassertn(ptrsetFunc, "ptrset func not found");
body->push_back(
util::call(ptrsetFunc, {M->Nr<VarValue>(scalarFuncArgVars[0]), oitem}));
} else {
auto *litem = deref(result == lv ? 0 : 1);
auto *op = M->getOrRealizeFunc(name, {litem->getType()}, {}, FUSION_MODULE);
seqassertn(op, "1-op func '{}' not found", name);
auto *oitem = util::call(op, {litem});
auto *ptrsetFunc = M->getOrRealizeFunc(
"_ptrset", {scalarFuncArgTypes[0], oitem->getType()}, {}, FUSION_MODULE);
seqassertn(ptrsetFunc, "ptrset func not found");
body->push_back(
util::call(ptrsetFunc, {M->Nr<VarValue>(scalarFuncArgVars[0]), oitem}));
}
scalarFunc->setBody(body);
auto *arraysTuple = util::makeTuple(arrays);
auto *loopFunc = M->getOrRealizeFunc(
"_loop_basic",
{arraysTuple->getType(), scalarFunc->getType(), extraTuple->getType()}, {},
FUSION_MODULE);
seqassertn(loopFunc, "loop_basic func not found");
series->push_back(
util::call(loopFunc, {arraysTuple, M->Nr<VarValue>(scalarFunc), extraTuple}));
}
auto freeArray = [&](Var *arr) {
auto *freeFunc = M->getOrRealizeFunc("_free", {arr->getType()}, {}, FUSION_MODULE);
seqassertn(freeFunc, "free func not found");
return util::call(freeFunc, {M->Nr<VarValue>(arr)});
};
seqassertn(!(freeLeftStatic && lcond), "unexpected free conditions for left arg");
seqassertn(!(freeRightStatic && rcond), "unexpected free conditions for right arg");
if (lcond && rcond) {
series->push_back(M->Nr<IfFlow>(
M->Nr<VarValue>(lcond), util::series(freeArray(rv)),
util::series(freeArray(lv),
M->Nr<IfFlow>(M->Nr<VarValue>(rcond), M->Nr<SeriesFlow>(),
util::series(freeArray(rv))))));
} else {
if (freeLeftStatic) {
series->push_back(freeArray(lv));
} else if (lcond) {
series->push_back(M->Nr<IfFlow>(M->Nr<VarValue>(lcond), M->Nr<SeriesFlow>(),
util::series(freeArray(lv))));
}
if (freeRightStatic) {
series->push_back(freeArray(rv));
} else if (rcond) {
series->push_back(M->Nr<IfFlow>(M->Nr<VarValue>(rcond), M->Nr<SeriesFlow>(),
util::series(freeArray(rv))));
}
}
return result;
}
BroadcastInfo NumPyExpr::getBroadcastInfo() {
int64_t arrDim = -1;
Var *varLeaf = nullptr;
bool multipleLeafVars = false;
int numNonVarLeafArrays = 0;
bool definitelyBroadcasts = false;
apply([&](NumPyExpr &e) {
if (e.isLeaf() && e.type.isArray()) {
if (arrDim == -1) {
arrDim = e.type.ndim;
} else if (arrDim != e.type.ndim) {
definitelyBroadcasts = true;
}
if (auto *v = cast<VarValue>(e.val)) {
if (varLeaf) {
if (varLeaf != v->getVar())
multipleLeafVars = true;
} else {
varLeaf = v->getVar();
}
} else {
++numNonVarLeafArrays;
}
}
});
bool mightBroadcast = numNonVarLeafArrays > 1 || multipleLeafVars ||
(numNonVarLeafArrays == 1 && varLeaf);
if (definitelyBroadcasts) {
return BroadcastInfo::YES;
} else if (mightBroadcast) {
return BroadcastInfo::MAYBE;
} else {
return BroadcastInfo::NO;
}
}
Value *NumPyExpr::codegenScalarExpr(
CodegenContext &C, const std::unordered_map<NumPyExpr *, Var *> &args,
const std::unordered_map<NumPyExpr *, unsigned> &scalarMap, Var *scalars) {
auto *M = C.M;
auto &T = C.T;
Value *lv = lhs ? lhs->codegenScalarExpr(C, args, scalarMap, scalars) : nullptr;
Value *rv = rhs ? rhs->codegenScalarExpr(C, args, scalarMap, scalars) : nullptr;
auto name = "_" + opstring();
if (lv && rv) {
auto *t = type.getIRBaseType(T);
auto *commonType = decideTypes(this, lhs->type, rhs->type, T);
auto *cast1 =
M->getOrRealizeFunc("_cast", {lv->getType()}, {commonType}, FUSION_MODULE);
auto *cast2 =
M->getOrRealizeFunc("_cast", {rv->getType()}, {commonType}, FUSION_MODULE);
lv = util::call(cast1, {lv});
rv = util::call(cast2, {rv});
auto *f =
M->getOrRealizeFunc(name, {lv->getType(), rv->getType()}, {}, FUSION_MODULE);
seqassertn(f, "2-op func '{}' not found", name);
return util::call(f, {lv, rv});
} else if (lv) {
auto *t = type.getIRBaseType(T);
auto *f = M->getOrRealizeFunc(name, {lv->getType()}, {}, FUSION_MODULE);
seqassertn(f, "1-op func '{}' not found", name);
return util::call(f, {lv});
} else {
if (type.isArray()) {
auto it = args.find(this);
seqassertn(it != args.end(), "NumPyExpr not found in args map (codegen expr)");
auto *var = it->second;
return (*M->Nr<VarValue>(var))[*M->getInt(0)];
} else {
auto it = scalarMap.find(this);
seqassertn(it != scalarMap.end(),
"NumPyExpr not found in scalar map (codegen expr)");
auto idx = it->second;
return util::tupleGet(M->Nr<VarValue>(scalars), idx);
}
}
}
} // namespace numpy
} // namespace transform
} // namespace ir
} // namespace codon

View File

@ -0,0 +1,385 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "numpy.h"
namespace codon {
namespace ir {
namespace transform {
namespace numpy {
namespace {
using CFG = analyze::dataflow::CFGraph;
using CFBlock = analyze::dataflow::CFBlock;
using RD = analyze::dataflow::RDInspector;
using SE = analyze::module::SideEffectResult;
struct GetVars : public util::Operator {
std::unordered_set<id_t> &vids;
explicit GetVars(std::unordered_set<id_t> &vids) : util::Operator(), vids(vids) {}
void preHook(Node *v) override {
for (auto *var : v->getUsedVariables()) {
if (!isA<Func>(var))
vids.insert(var->getId());
}
}
};
struct OkToForwardPast : public util::Operator {
std::unordered_set<id_t> &vids;
const std::unordered_map<id_t, NumPyExpr *> &parsedValues;
SE *se;
bool ok;
OkToForwardPast(std::unordered_set<id_t> &vids,
const std::unordered_map<id_t, NumPyExpr *> &parsedValues, SE *se)
: util::Operator(), vids(vids), parsedValues(parsedValues), se(se), ok(true) {}
void preHook(Node *v) override {
if (!ok) {
return;
} else if (auto *assign = cast<AssignInstr>(v)) {
if (vids.count(assign->getLhs()->getId()))
ok = false;
} else if (auto *val = cast<Value>(v)) {
auto it = parsedValues.find(val->getId());
if (it != parsedValues.end()) {
it->second->apply([&](NumPyExpr &e) {
if (e.isLeaf() && se->hasSideEffect(e.val))
ok = false;
});
// Skip children since we are processing them manually above.
for (auto *used : val->getUsedValues())
see(used);
} else if (se->hasSideEffect(val)) {
ok = false;
}
}
}
};
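// Collects every value in the function that uses 'var'.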
struct GetAllUses : public util::Operator {
Var *var;
std::vector<Value *> &uses;
GetAllUses(Var *var, std::vector<Value *> &uses)
: util::Operator(), var(var), uses(uses) {}
void preHook(Node *n) override {
if (auto *v = cast<Value>(n)) {
auto vars = v->getUsedVariables();
if (std::find(vars.begin(), vars.end(), var) != vars.end())
uses.push_back(v);
}
}
};
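// Returns true if the expression computed at 'source' remains valid along the
// given CFG path up to 'destination', i.e. no value strictly between the two
// blocks forwarding (per OkToForwardPast).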
bool canForwardExpressionAlongPath(
Value *source, Value *destination, std::unordered_set<id_t> &vids,
const std::unordered_map<id_t, NumPyExpr *> &parsedValues, SE *se,
const std::vector<CFBlock *> &path) {
if (path.empty())
return true;
bool go = false;
for (auto *block : path) {
for (const auto *value : *block) {
// Skip things before 'source' in first block
if (!go && block == path.front() && value == source) {
go = true;
continue;
}
// Skip things after 'destination' in last block
if (go && block == path.back() && value == destination) {
go = false;
break;
}
if (!go)
continue;
OkToForwardPast check(vids, parsedValues, se);
const_cast<Value *>(value)->accept(check);
if (!check.ok)
return false;
}
}
return true;
}
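// Returns true if the expression assigned by 'expr' can be forwarded to
// 'target': every leaf must be free of side effects, and every CFG path from
// the assignment to the target must pass the per-path check above.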
bool canForwardExpression(NumPyOptimizationUnit *expr, Value *target,
const std::unordered_map<id_t, NumPyExpr *> &parsedValues,
CFG *cfg, SE *se) {
std::unordered_set<id_t> vids;
bool pure = true;
expr->expr->apply([&](NumPyExpr &e) {
if (e.isLeaf()) {
if (se->hasSideEffect(e.val)) {
pure = false;
} else {
GetVars gv(vids);
e.val->accept(gv);
}
}
});
if (!pure)
return false;
auto *source = expr->assign;
auto *start = cfg->getBlock(source);
auto *end = cfg->getBlock(target);
seqassertn(start, "start CFG block not found");
seqassertn(end, "end CFG block not found");
bool ok = true;
std::function<void(CFBlock *, std::vector<CFBlock *> &)> dfs =
[&](CFBlock *curr, std::vector<CFBlock *> &path) {
path.push_back(curr);
if (curr == end) {
if (!canForwardExpressionAlongPath(source, target, vids, parsedValues, se,
path))
ok = false;
} else {
for (auto it = curr->successors_begin(); it != curr->successors_end(); ++it) {
// Recurse only into successors not already on the current path (cycle guard).
if (std::find(path.begin(), path.end(), *it) == path.end())
dfs(*it, path);
}
}
path.pop_back();
};
std::vector<CFBlock *> path;
dfs(start, path);
return ok;
}
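// Returns true if the variable defined by 'assign' can be forwarded into
// 'destination': the assignment must be the only definition reaching the
// destination, and it must reach no other use of the variable.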
bool canForwardVariable(AssignInstr *assign, Value *destination, BodiedFunc *func,
RD *rd) {
auto *var = assign->getLhs();
// Check 1: Only the given assignment should reach the destination.
auto reaching = rd->getReachingDefinitions(var, destination);
if (reaching.size() != 1 || *reaching.begin() != assign->getRhs()->getId())
return false;
// Check 2: There should be no other uses of the variable that the given assignment
// reaches.
std::vector<Value *> uses;
GetAllUses gu(var, uses);
func->accept(gu);
for (auto *use : uses) {
if (use != destination && use->getId() != assign->getId() &&
rd->getReachingDefinitions(var, use).count(assign->getRhs()->getId()))
return false;
}
return true;
}
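// Builds a DAG over the extracted expressions: an edge is added from a
// destination expression to a source expression when a variable leaf of the
// destination can safely be replaced by the expression assigned to that
// variable.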
ForwardingDAG buildForwardingDAG(BodiedFunc *func, RD *rd, CFG *cfg, SE *se,
std::vector<NumPyOptimizationUnit> &exprs) {
std::unordered_map<id_t, NumPyExpr *> parsedValues;
for (auto &e : exprs) {
e.expr->apply([&](NumPyExpr &e) {
if (e.val)
parsedValues.emplace(e.val->getId(), &e);
});
}
ForwardingDAG dag;
int64_t dstId = 0;
for (auto &dst : exprs) {
auto *target = dst.expr.get();
auto &forwardingVec = dag[&dst];
std::vector<std::pair<Var *, NumPyExpr *>> vars;
target->apply([&](NumPyExpr &e) {
if (e.isLeaf()) {
if (auto *v = cast<VarValue>(e.val)) {
vars.emplace_back(v->getVar(), &e);
}
}
});
for (auto &p : vars) {
int64_t srcId = 0;
for (auto &src : exprs) {
if (srcId != dstId && src.assign && src.assign->getLhs() == p.first) {
auto checkFwdVar = canForwardVariable(src.assign, p.second->val, func, rd);
auto checkFwdExpr =
canForwardExpression(&src, p.second->val, parsedValues, cfg, se);
if (checkFwdVar && checkFwdExpr)
forwardingVec.push_back({&dst, &src, p.first, p.second, dstId, srcId});
}
++srcId;
}
}
++dstId;
}
return dag;
}
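// Union-find (disjoint-set) structure with path compression and union by rank,
// used to group expressions into connected components.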
struct UnionFind {
std::vector<int64_t> parent;
std::vector<int64_t> rank;
explicit UnionFind(int64_t n) : parent(n), rank(n) {
for (auto i = 0; i < n; i++) {
parent[i] = i;
rank[i] = 0;
}
}
int64_t find(int64_t u) {
if (parent[u] != u)
parent[u] = find(parent[u]);
return parent[u];
}
void union_(int64_t u, int64_t v) {
auto ru = find(u);
auto rv = find(v);
if (ru != rv) {
if (rank[ru] > rank[rv]) {
parent[rv] = ru;
} else if (rank[ru] < rank[rv]) {
parent[ru] = rv;
} else {
parent[rv] = ru;
++rank[ru];
}
}
}
};
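// Splits the forwarding DAG into connected components so each group of
// inter-dependent expressions can be processed independently.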
std::vector<ForwardingDAG>
getForwardingDAGConnectedComponents(ForwardingDAG &dag,
std::vector<NumPyOptimizationUnit> &exprs) {
auto n = exprs.size();
UnionFind uf(n);
for (auto i = 0; i < n; i++) {
for (auto &fwd : dag[&exprs[i]]) {
uf.union_(i, fwd.srcId);
}
}
std::vector<std::vector<NumPyOptimizationUnit *>> components(n);
for (auto i = 0; i < n; i++) {
auto root = uf.find(i);
components[root].push_back(&exprs[i]);
}
std::vector<ForwardingDAG> result;
for (auto &c : components) {
if (c.empty())
continue;
ForwardingDAG d;
for (auto *expr : c)
d.emplace(expr, dag[expr]);
result.push_back(d);
}
return result;
}
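// DFS-based cycle detection over the forwarding DAG; cyclic components are
// discarded by getForwardingDAGs() below.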
bool hasCycleHelper(int64_t v, ForwardingDAG &dag,
std::vector<NumPyOptimizationUnit> &exprs,
std::vector<bool> &visited, std::vector<bool> &recStack) {
visited[v] = true;
recStack[v] = true;
for (auto &neighbor : dag[&exprs[v]]) {
if (!visited[neighbor.srcId]) {
if (hasCycleHelper(neighbor.srcId, dag, exprs, visited, recStack))
return true;
} else if (recStack[neighbor.srcId]) {
return true;
}
}
recStack[v] = false;
return false;
}
bool hasCycle(ForwardingDAG &dag, std::vector<NumPyOptimizationUnit> &exprs) {
auto n = exprs.size();
std::vector<bool> visited(n, false);
std::vector<bool> recStack(n, false);
for (auto i = 0; i < n; i++) {
if (dag.find(&exprs[i]) != dag.end() && !visited[i] &&
hasCycleHelper(i, dag, exprs, visited, recStack))
return true;
}
return false;
}
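// Applies forwardings bottom-up: each source expression is substituted into the
// destination leaf that referenced its variable, the source's leaves are
// appended to the destination's leaf list, and the source's assignment is
// scheduled for deletion.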
void doForwardingHelper(ForwardingDAG &dag, NumPyOptimizationUnit *curr,
std::unordered_set<NumPyOptimizationUnit *> &done,
std::vector<AssignInstr *> &assignsToDelete) {
if (done.count(curr))
return;
auto forwardings = dag[curr];
for (auto &fwd : forwardings) {
doForwardingHelper(dag, fwd.src, done, assignsToDelete);
// Note that order of leaves here doesn't matter since they're guaranteed to have no
// side effects based on forwarding checks.
fwd.dst->leaves.insert(fwd.dst->leaves.end(), fwd.src->leaves.begin(),
fwd.src->leaves.end());
fwd.dstLeaf->replace(*fwd.src->expr);
assignsToDelete.push_back(fwd.src->assign);
}
done.insert(curr);
}
} // namespace
std::vector<ForwardingDAG>
getForwardingDAGs(BodiedFunc *func, RD *rd, CFG *cfg, SE *se,
std::vector<NumPyOptimizationUnit> &exprs) {
auto dag = buildForwardingDAG(func, rd, cfg, se, exprs);
auto dags = getForwardingDAGConnectedComponents(dag, exprs);
dags.erase(std::remove_if(dags.begin(), dags.end(),
[&](ForwardingDAG &dag) { return hasCycle(dag, exprs); }),
dags.end());
return dags;
}
NumPyOptimizationUnit *doForwarding(ForwardingDAG &dag,
std::vector<AssignInstr *> &assignsToDelete) {
seqassertn(!dag.empty(), "empty forwarding DAG encountered");
std::unordered_set<NumPyOptimizationUnit *> done;
for (auto &e : dag) {
doForwardingHelper(dag, e.first, done, assignsToDelete);
}
// Find the root
std::unordered_set<NumPyOptimizationUnit *> notRoot;
for (auto &e : dag) {
for (auto &f : e.second) {
notRoot.insert(f.src);
}
}
seqassertn(notRoot.size() == dag.size() - 1,
"multiple roots found in forwarding DAG");
for (auto &e : dag) {
if (notRoot.count(e.first) == 0)
return e.first;
}
seqassertn(false, "could not find root in forwarding DAG");
return nullptr;
}
} // namespace numpy
} // namespace transform
} // namespace ir
} // namespace codon

View File

@ -0,0 +1,877 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "numpy.h"
#include "codon/cir/analyze/dataflow/reaching.h"
#include "codon/cir/analyze/module/global_vars.h"
#include "codon/cir/analyze/module/side_effect.h"
#include "codon/cir/util/cloning.h"
#include "codon/cir/util/irtools.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <complex>
#include <sstream>
#include <utility>
#define XLOG(c, ...) \
do { \
if (Verbose) \
LOG(c, ##__VA_ARGS__); \
} while (false)
namespace codon {
namespace ir {
namespace transform {
namespace numpy {
namespace {
llvm::cl::opt<int> AlwaysFuseCostThreshold(
"npfuse-always", llvm::cl::desc("Expression cost below which (<=) to always fuse"),
llvm::cl::init(10));
llvm::cl::opt<int> NeverFuseCostThreshold(
"npfuse-never", llvm::cl::desc("Expression cost above which (>) to never fuse"),
llvm::cl::init(50));
llvm::cl::opt<bool> Verbose("npfuse-verbose",
llvm::cl::desc("Print information about fused expressions"),
llvm::cl::init(false));
bool isArrayType(types::Type *t) {
return t && isA<types::RecordType>(t) &&
t->getName().rfind("std.numpy.ndarray.ndarray[", 0) == 0;
}
bool isUFuncType(types::Type *t) {
return t && (t->getName().rfind("std.numpy.ufunc.UnaryUFunc[", 0) == 0 ||
t->getName().rfind("std.numpy.ufunc.BinaryUFunc[", 0) == 0);
}
bool isNoneType(types::Type *t, NumPyPrimitiveTypes &T) {
return t && (t->is(T.none) || t->is(T.optnone));
}
} // namespace
const std::string FUSION_MODULE = "std.numpy.fusion";
NumPyPrimitiveTypes::NumPyPrimitiveTypes(Module *M)
: none(M->getNoneType()), optnone(M->getOptionalType(none)),
bool_(M->getBoolType()), i8(M->getIntNType(8, true)),
u8(M->getIntNType(8, false)), i16(M->getIntNType(16, true)),
u16(M->getIntNType(16, false)), i32(M->getIntNType(32, true)),
u32(M->getIntNType(32, false)), i64(M->getIntType()),
u64(M->getIntNType(64, false)), f16(M->getFloat16Type()),
f32(M->getFloat32Type()), f64(M->getFloatType()),
c64(M->getType("std.internal.types.complex.complex64")),
c128(M->getType("std.internal.types.complex.complex")) {}
NumPyType::NumPyType(Type dtype, int64_t ndim) : dtype(dtype), ndim(ndim) {
seqassertn(ndim >= 0, "ndim must be non-negative");
}
NumPyType::NumPyType() : NumPyType(NP_TYPE_NONE) {}
NumPyType NumPyType::get(types::Type *t, NumPyPrimitiveTypes &T) {
if (t->is(T.bool_))
return {NumPyType::NP_TYPE_BOOL};
if (t->is(T.i8))
return {NumPyType::NP_TYPE_I8};
if (t->is(T.u8))
return {NumPyType::NP_TYPE_U8};
if (t->is(T.i16))
return {NumPyType::NP_TYPE_I16};
if (t->is(T.u16))
return {NumPyType::NP_TYPE_U16};
if (t->is(T.i32))
return {NumPyType::NP_TYPE_I32};
if (t->is(T.u32))
return {NumPyType::NP_TYPE_U32};
if (t->is(T.i64))
return {NumPyType::NP_TYPE_I64};
if (t->is(T.u64))
return {NumPyType::NP_TYPE_U64};
if (t->is(T.f16))
return {NumPyType::NP_TYPE_F16};
if (t->is(T.f32))
return {NumPyType::NP_TYPE_F32};
if (t->is(T.f64))
return {NumPyType::NP_TYPE_F64};
if (t->is(T.c64))
return {NumPyType::NP_TYPE_C64};
if (t->is(T.c128))
return {NumPyType::NP_TYPE_C128};
if (isArrayType(t)) {
auto generics = t->getGenerics();
seqassertn(generics.size() == 2 && generics[0].isType() && generics[1].isStatic(),
"unrecognized ndarray generics");
auto *dtype = generics[0].getTypeValue();
auto ndim = generics[1].getStaticValue();
if (dtype->is(T.bool_))
return {NumPyType::NP_TYPE_ARR_BOOL, ndim};
if (dtype->is(T.i8))
return {NumPyType::NP_TYPE_ARR_I8, ndim};
if (dtype->is(T.u8))
return {NumPyType::NP_TYPE_ARR_U8, ndim};
if (dtype->is(T.i16))
return {NumPyType::NP_TYPE_ARR_I16, ndim};
if (dtype->is(T.u16))
return {NumPyType::NP_TYPE_ARR_U16, ndim};
if (dtype->is(T.i32))
return {NumPyType::NP_TYPE_ARR_I32, ndim};
if (dtype->is(T.u32))
return {NumPyType::NP_TYPE_ARR_U32, ndim};
if (dtype->is(T.i64))
return {NumPyType::NP_TYPE_ARR_I64, ndim};
if (dtype->is(T.u64))
return {NumPyType::NP_TYPE_ARR_U64, ndim};
if (dtype->is(T.f16))
return {NumPyType::NP_TYPE_ARR_F16, ndim};
if (dtype->is(T.f32))
return {NumPyType::NP_TYPE_ARR_F32, ndim};
if (dtype->is(T.f64))
return {NumPyType::NP_TYPE_ARR_F64, ndim};
if (dtype->is(T.c64))
return {NumPyType::NP_TYPE_ARR_C64, ndim};
if (dtype->is(T.c128))
return {NumPyType::NP_TYPE_ARR_C128, ndim};
}
return {};
}
types::Type *NumPyType::getIRBaseType(NumPyPrimitiveTypes &T) const {
switch (dtype) {
case NP_TYPE_NONE:
seqassertn(false, "unexpected type code (NONE)");
return nullptr;
case NP_TYPE_BOOL:
return T.bool_;
case NP_TYPE_I8:
return T.i8;
case NP_TYPE_U8:
return T.u8;
case NP_TYPE_I16:
return T.i16;
case NP_TYPE_U16:
return T.u16;
case NP_TYPE_I32:
return T.i32;
case NP_TYPE_U32:
return T.u32;
case NP_TYPE_I64:
return T.i64;
case NP_TYPE_U64:
return T.u64;
case NP_TYPE_F16:
return T.f16;
case NP_TYPE_F32:
return T.f32;
case NP_TYPE_F64:
return T.f64;
case NP_TYPE_C64:
return T.c64;
case NP_TYPE_C128:
return T.c128;
case NP_TYPE_SCALAR_END:
seqassertn(false, "unexpected type code (SCALAR_END)");
return nullptr;
case NP_TYPE_ARR_BOOL:
return T.bool_;
case NP_TYPE_ARR_I8:
return T.i8;
case NP_TYPE_ARR_U8:
return T.u8;
case NP_TYPE_ARR_I16:
return T.i16;
case NP_TYPE_ARR_U16:
return T.u16;
case NP_TYPE_ARR_I32:
return T.i32;
case NP_TYPE_ARR_U32:
return T.u32;
case NP_TYPE_ARR_I64:
return T.i64;
case NP_TYPE_ARR_U64:
return T.u64;
case NP_TYPE_ARR_F16:
return T.f16;
case NP_TYPE_ARR_F32:
return T.f32;
case NP_TYPE_ARR_F64:
return T.f64;
case NP_TYPE_ARR_C64:
return T.c64;
case NP_TYPE_ARR_C128:
return T.c128;
default:
seqassertn(false, "unexpected type code (?)");
return nullptr;
}
}
std::ostream &operator<<(std::ostream &os, NumPyType const &type) {
static const std::unordered_map<NumPyType::Type, std::string> typestrings = {
{NumPyType::NP_TYPE_NONE, "none"}, {NumPyType::NP_TYPE_BOOL, "bool"},
{NumPyType::NP_TYPE_I8, "i8"}, {NumPyType::NP_TYPE_U8, "u8"},
{NumPyType::NP_TYPE_I16, "i16"}, {NumPyType::NP_TYPE_U16, "u16"},
{NumPyType::NP_TYPE_I32, "i32"}, {NumPyType::NP_TYPE_U32, "u32"},
{NumPyType::NP_TYPE_I64, "i64"}, {NumPyType::NP_TYPE_U64, "u64"},
{NumPyType::NP_TYPE_F16, "f16"}, {NumPyType::NP_TYPE_F32, "f32"},
{NumPyType::NP_TYPE_F64, "f64"}, {NumPyType::NP_TYPE_C64, "c64"},
{NumPyType::NP_TYPE_C128, "c128"}, {NumPyType::NP_TYPE_SCALAR_END, ""},
{NumPyType::NP_TYPE_ARR_BOOL, "bool"}, {NumPyType::NP_TYPE_ARR_I8, "i8"},
{NumPyType::NP_TYPE_ARR_U8, "u8"}, {NumPyType::NP_TYPE_ARR_I16, "i16"},
{NumPyType::NP_TYPE_ARR_U16, "u16"}, {NumPyType::NP_TYPE_ARR_I32, "i32"},
{NumPyType::NP_TYPE_ARR_U32, "u32"}, {NumPyType::NP_TYPE_ARR_I64, "i64"},
{NumPyType::NP_TYPE_ARR_U64, "u64"}, {NumPyType::NP_TYPE_ARR_F16, "f16"},
{NumPyType::NP_TYPE_ARR_F32, "f32"}, {NumPyType::NP_TYPE_ARR_F64, "f64"},
{NumPyType::NP_TYPE_ARR_C64, "c64"}, {NumPyType::NP_TYPE_ARR_C128, "c128"},
};
auto it = typestrings.find(type.dtype);
seqassertn(it != typestrings.end(), "type not found");
auto s = it->second;
if (type.isArray())
os << "array[" << s << ", " << type.ndim << "]";
else
os << s;
return os;
}
std::string NumPyType::str() const {
std::stringstream buffer;
buffer << *this;
return buffer.str();
}
CodegenContext::CodegenContext(Module *M, SeriesFlow *series, BodiedFunc *func,
NumPyPrimitiveTypes &T)
: M(M), series(series), func(func), vars(), T(T) {}
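// Recursively parses an IR value into a NumPyExpr tree. Recognized forms are
// matmul/dot calls, builtin abs(), transpose, ufunc calls with default
// 'out'/'where' arguments, ndarray magic-method calls and right-hand-side
// magic flows; anything unrecognized (and any scalar or 0-dim result) becomes
// a leaf recorded in 'leaves'.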
std::unique_ptr<NumPyExpr> parse(Value *v,
std::vector<std::pair<NumPyExpr *, Value *>> &leaves,
NumPyPrimitiveTypes &T) {
struct NumPyMagicMethod {
std::string name;
NumPyExpr::Op op;
int args;
bool right;
};
struct NumPyUFunc {
std::string name;
NumPyExpr::Op op;
int args;
};
static std::vector<NumPyMagicMethod> magics = {
{Module::POS_MAGIC_NAME, NumPyExpr::NP_OP_POS, 1, false},
{Module::NEG_MAGIC_NAME, NumPyExpr::NP_OP_NEG, 1, false},
{Module::INVERT_MAGIC_NAME, NumPyExpr::NP_OP_INVERT, 1, false},
{Module::ABS_MAGIC_NAME, NumPyExpr::NP_OP_ABS, 1, false},
{Module::ADD_MAGIC_NAME, NumPyExpr::NP_OP_ADD, 2, false},
{Module::SUB_MAGIC_NAME, NumPyExpr::NP_OP_SUB, 2, false},
{Module::MUL_MAGIC_NAME, NumPyExpr::NP_OP_MUL, 2, false},
{Module::MATMUL_MAGIC_NAME, NumPyExpr::NP_OP_MATMUL, 2, false},
{Module::TRUE_DIV_MAGIC_NAME, NumPyExpr::NP_OP_TRUE_DIV, 2, false},
{Module::FLOOR_DIV_MAGIC_NAME, NumPyExpr::NP_OP_FLOOR_DIV, 2, false},
{Module::MOD_MAGIC_NAME, NumPyExpr::NP_OP_MOD, 2, false},
{Module::POW_MAGIC_NAME, NumPyExpr::NP_OP_POW, 2, false},
{Module::LSHIFT_MAGIC_NAME, NumPyExpr::NP_OP_LSHIFT, 2, false},
{Module::RSHIFT_MAGIC_NAME, NumPyExpr::NP_OP_RSHIFT, 2, false},
{Module::AND_MAGIC_NAME, NumPyExpr::NP_OP_AND, 2, false},
{Module::OR_MAGIC_NAME, NumPyExpr::NP_OP_OR, 2, false},
{Module::XOR_MAGIC_NAME, NumPyExpr::NP_OP_XOR, 2, false},
{Module::RADD_MAGIC_NAME, NumPyExpr::NP_OP_ADD, 2, true},
{Module::RSUB_MAGIC_NAME, NumPyExpr::NP_OP_SUB, 2, true},
{Module::RMUL_MAGIC_NAME, NumPyExpr::NP_OP_MUL, 2, true},
{Module::RMATMUL_MAGIC_NAME, NumPyExpr::NP_OP_MATMUL, 2, true},
{Module::RTRUE_DIV_MAGIC_NAME, NumPyExpr::NP_OP_TRUE_DIV, 2, true},
{Module::RFLOOR_DIV_MAGIC_NAME, NumPyExpr::NP_OP_FLOOR_DIV, 2, true},
{Module::RMOD_MAGIC_NAME, NumPyExpr::NP_OP_MOD, 2, true},
{Module::RPOW_MAGIC_NAME, NumPyExpr::NP_OP_POW, 2, true},
{Module::RLSHIFT_MAGIC_NAME, NumPyExpr::NP_OP_LSHIFT, 2, true},
{Module::RRSHIFT_MAGIC_NAME, NumPyExpr::NP_OP_RSHIFT, 2, true},
{Module::RAND_MAGIC_NAME, NumPyExpr::NP_OP_AND, 2, true},
{Module::ROR_MAGIC_NAME, NumPyExpr::NP_OP_OR, 2, true},
{Module::RXOR_MAGIC_NAME, NumPyExpr::NP_OP_XOR, 2, true},
{Module::EQ_MAGIC_NAME, NumPyExpr::NP_OP_EQ, 2, false},
{Module::NE_MAGIC_NAME, NumPyExpr::NP_OP_NE, 2, false},
{Module::LT_MAGIC_NAME, NumPyExpr::NP_OP_LT, 2, false},
{Module::LE_MAGIC_NAME, NumPyExpr::NP_OP_LE, 2, false},
{Module::GT_MAGIC_NAME, NumPyExpr::NP_OP_GT, 2, false},
{Module::GE_MAGIC_NAME, NumPyExpr::NP_OP_GE, 2, false},
};
static std::vector<NumPyUFunc> ufuncs = {
{"positive", NumPyExpr::NP_OP_POS, 1},
{"negative", NumPyExpr::NP_OP_NEG, 1},
{"invert", NumPyExpr::NP_OP_INVERT, 1},
{"abs", NumPyExpr::NP_OP_ABS, 1},
{"absolute", NumPyExpr::NP_OP_ABS, 1},
{"add", NumPyExpr::NP_OP_ADD, 2},
{"subtract", NumPyExpr::NP_OP_SUB, 2},
{"multiply", NumPyExpr::NP_OP_MUL, 2},
{"divide", NumPyExpr::NP_OP_TRUE_DIV, 2},
{"floor_divide", NumPyExpr::NP_OP_FLOOR_DIV, 2},
{"remainder", NumPyExpr::NP_OP_MOD, 2},
{"fmod", NumPyExpr::NP_OP_FMOD, 2},
{"power", NumPyExpr::NP_OP_POW, 2},
{"left_shift", NumPyExpr::NP_OP_LSHIFT, 2},
{"right_shift", NumPyExpr::NP_OP_RSHIFT, 2},
{"bitwise_and", NumPyExpr::NP_OP_AND, 2},
{"bitwise_or", NumPyExpr::NP_OP_OR, 2},
{"bitwise_xor", NumPyExpr::NP_OP_XOR, 2},
{"logical_and", NumPyExpr::NP_OP_LOGICAL_AND, 2},
{"logical_or", NumPyExpr::NP_OP_LOGICAL_OR, 2},
{"logical_xor", NumPyExpr::NP_OP_LOGICAL_XOR, 2},
{"equal", NumPyExpr::NP_OP_EQ, 2},
{"not_equal", NumPyExpr::NP_OP_NE, 2},
{"less", NumPyExpr::NP_OP_LT, 2},
{"less_equal", NumPyExpr::NP_OP_LE, 2},
{"greater", NumPyExpr::NP_OP_GT, 2},
{"greater_equal", NumPyExpr::NP_OP_GE, 2},
{"minimum", NumPyExpr::NP_OP_MIN, 2},
{"maximum", NumPyExpr::NP_OP_MAX, 2},
{"fmin", NumPyExpr::NP_OP_FMIN, 2},
{"fmax", NumPyExpr::NP_OP_FMAX, 2},
{"sin", NumPyExpr::NP_OP_SIN, 1},
{"cos", NumPyExpr::NP_OP_COS, 1},
{"tan", NumPyExpr::NP_OP_TAN, 1},
{"arcsin", NumPyExpr::NP_OP_ARCSIN, 1},
{"arccos", NumPyExpr::NP_OP_ARCCOS, 1},
{"arctan", NumPyExpr::NP_OP_ARCTAN, 1},
{"arctan2", NumPyExpr::NP_OP_ARCTAN2, 2},
{"hypot", NumPyExpr::NP_OP_HYPOT, 2},
{"sinh", NumPyExpr::NP_OP_SINH, 1},
{"cosh", NumPyExpr::NP_OP_COSH, 1},
{"tanh", NumPyExpr::NP_OP_TANH, 1},
{"arcsinh", NumPyExpr::NP_OP_ARCSINH, 1},
{"arccosh", NumPyExpr::NP_OP_ARCCOSH, 1},
{"arctanh", NumPyExpr::NP_OP_ARCTANH, 1},
{"conjugate", NumPyExpr::NP_OP_CONJ, 1},
{"exp", NumPyExpr::NP_OP_EXP, 1},
{"exp2", NumPyExpr::NP_OP_EXP2, 1},
{"log", NumPyExpr::NP_OP_LOG, 1},
{"log2", NumPyExpr::NP_OP_LOG2, 1},
{"log10", NumPyExpr::NP_OP_LOG10, 1},
{"expm1", NumPyExpr::NP_OP_EXPM1, 1},
{"log1p", NumPyExpr::NP_OP_LOG1P, 1},
{"sqrt", NumPyExpr::NP_OP_SQRT, 1},
{"square", NumPyExpr::NP_OP_SQUARE, 1},
{"cbrt", NumPyExpr::NP_OP_CBRT, 1},
{"logaddexp", NumPyExpr::NP_OP_LOGADDEXP, 2},
{"logaddexp2", NumPyExpr::NP_OP_LOGADDEXP2, 2},
{"reciprocal", NumPyExpr::NP_OP_RECIPROCAL, 1},
{"rint", NumPyExpr::NP_OP_RINT, 1},
{"floor", NumPyExpr::NP_OP_FLOOR, 1},
{"ceil", NumPyExpr::NP_OP_CEIL, 1},
{"trunc", NumPyExpr::NP_OP_TRUNC, 1},
{"isnan", NumPyExpr::NP_OP_ISNAN, 1},
{"isinf", NumPyExpr::NP_OP_ISINF, 1},
{"isfinite", NumPyExpr::NP_OP_ISFINITE, 1},
{"sign", NumPyExpr::NP_OP_SIGN, 1},
{"signbit", NumPyExpr::NP_OP_SIGNBIT, 1},
{"copysign", NumPyExpr::NP_OP_COPYSIGN, 2},
{"spacing", NumPyExpr::NP_OP_SPACING, 1},
{"nextafter", NumPyExpr::NP_OP_NEXTAFTER, 2},
{"deg2rad", NumPyExpr::NP_OP_DEG2RAD, 1},
{"radians", NumPyExpr::NP_OP_DEG2RAD, 1},
{"rad2deg", NumPyExpr::NP_OP_RAD2DEG, 1},
{"degrees", NumPyExpr::NP_OP_RAD2DEG, 1},
{"heaviside", NumPyExpr::NP_OP_HEAVISIDE, 2},
};
auto getNumPyExprType = [](types::Type *t, NumPyPrimitiveTypes &T) -> NumPyType {
if (t->is(T.bool_))
return {NumPyType::NP_TYPE_BOOL};
if (t->is(T.i8))
return {NumPyType::NP_TYPE_I8};
if (t->is(T.u8))
return {NumPyType::NP_TYPE_U8};
if (t->is(T.i16))
return {NumPyType::NP_TYPE_I16};
if (t->is(T.u16))
return {NumPyType::NP_TYPE_U16};
if (t->is(T.i32))
return {NumPyType::NP_TYPE_I32};
if (t->is(T.u32))
return {NumPyType::NP_TYPE_U32};
if (t->is(T.i64))
return {NumPyType::NP_TYPE_I64};
if (t->is(T.u64))
return {NumPyType::NP_TYPE_U64};
if (t->is(T.f16))
return {NumPyType::NP_TYPE_F16};
if (t->is(T.f32))
return {NumPyType::NP_TYPE_F32};
if (t->is(T.f64))
return {NumPyType::NP_TYPE_F64};
if (t->is(T.c64))
return {NumPyType::NP_TYPE_C64};
if (t->is(T.c128))
return {NumPyType::NP_TYPE_C128};
if (isArrayType(t)) {
auto generics = t->getGenerics();
seqassertn(generics.size() == 2 && generics[0].isType() && generics[1].isStatic(),
"unrecognized ndarray generics");
auto *dtype = generics[0].getTypeValue();
auto ndim = generics[1].getStaticValue();
if (dtype->is(T.bool_))
return {NumPyType::NP_TYPE_ARR_BOOL, ndim};
if (dtype->is(T.i8))
return {NumPyType::NP_TYPE_ARR_I8, ndim};
if (dtype->is(T.u8))
return {NumPyType::NP_TYPE_ARR_U8, ndim};
if (dtype->is(T.i16))
return {NumPyType::NP_TYPE_ARR_I16, ndim};
if (dtype->is(T.u16))
return {NumPyType::NP_TYPE_ARR_U16, ndim};
if (dtype->is(T.i32))
return {NumPyType::NP_TYPE_ARR_I32, ndim};
if (dtype->is(T.u32))
return {NumPyType::NP_TYPE_ARR_U32, ndim};
if (dtype->is(T.i64))
return {NumPyType::NP_TYPE_ARR_I64, ndim};
if (dtype->is(T.u64))
return {NumPyType::NP_TYPE_ARR_U64, ndim};
if (dtype->is(T.f16))
return {NumPyType::NP_TYPE_ARR_F16, ndim};
if (dtype->is(T.f32))
return {NumPyType::NP_TYPE_ARR_F32, ndim};
if (dtype->is(T.f64))
return {NumPyType::NP_TYPE_ARR_F64, ndim};
if (dtype->is(T.c64))
return {NumPyType::NP_TYPE_ARR_C64, ndim};
if (dtype->is(T.c128))
return {NumPyType::NP_TYPE_ARR_C128, ndim};
}
return {};
};
auto type = getNumPyExprType(v->getType(), T);
if (!type)
return {};
// Don't break up expressions that result in scalars or 0-dim arrays since those
// should only be computed once
if (type.ndim == 0) {
auto res = std::make_unique<NumPyExpr>(type, v);
leaves.emplace_back(res.get(), v);
return std::move(res);
}
if (auto *c = cast<CallInstr>(v)) {
auto *f = util::getFunc(c->getCallee());
// Check for matmul
if (f && c->numArgs() == 3 && isNoneType(c->back()->getType(), T) &&
(f->getName().rfind("std.numpy.linalg_sym.matmul:0[", 0) == 0 ||
(f->getName().rfind("std.numpy.linalg_sym.dot:0[", 0) == 0 &&
type.ndim == 2))) {
std::vector<Value *> args(c->begin(), c->end());
auto op = NumPyExpr::NP_OP_MATMUL;
auto lhs = parse(args[0], leaves, T);
if (!lhs)
return {};
auto rhs = parse(args[1], leaves, T);
if (!rhs)
return {};
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs), std::move(rhs));
}
// Check for builtin abs()
if (f && c->numArgs() == 1 &&
(f->getName().rfind("std.internal.builtin.abs:0[", 0) == 0)) {
auto op = NumPyExpr::NP_OP_ABS;
auto lhs = parse(c->front(), leaves, T);
if (!lhs)
return {};
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs));
}
// Check for transpose
if (f && isArrayType(f->getParentType()) && c->numArgs() == 1 &&
f->getUnmangledName() == "T") {
auto op = NumPyExpr::NP_OP_TRANSPOSE;
auto lhs = parse(c->front(), leaves, T);
if (!lhs)
return {};
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs));
}
// Check for ufunc (e.g. "np.exp()") call
if (f && f->getUnmangledName() == Module::CALL_MAGIC_NAME &&
isUFuncType(f->getParentType())) {
auto ufuncGenerics = f->getParentType()->getGenerics();
seqassertn(!ufuncGenerics.empty() && ufuncGenerics[0].isStaticStr(),
"unrecognized ufunc class generics");
auto ufunc = ufuncGenerics[0].getStaticStringValue();
auto callGenerics = f->getType()->getGenerics();
seqassertn(!callGenerics.empty() && callGenerics[0].isType(),
"unrecognized ufunc call generics");
auto *dtype = callGenerics[0].getTypeValue();
if (dtype->is(T.none)) {
for (auto &u : ufuncs) {
if (u.name == ufunc) {
seqassertn(u.args == 1 || u.args == 2,
"unexpected number of arguments (ufunc)");
// Argument order:
// - ufunc self
// - operand 1
// - (if binary) operand 2
// - 'out'
// - 'where'
std::vector<Value *> args(c->begin(), c->end());
seqassertn(args.size() == u.args + 3, "unexpected call of {}", u.name);
auto *where = args[args.size() - 1];
auto *out = args[args.size() - 2];
if (auto *whereConst = cast<BoolConst>(where)) {
if (!whereConst->getVal())
break;
} else {
break;
}
if (!isNoneType(out->getType(), T))
break;
auto op = u.op;
auto lhs = parse(args[1], leaves, T);
if (!lhs)
return {};
if (u.args == 1)
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs));
auto rhs = parse(args[2], leaves, T);
if (!rhs)
return {};
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs),
std::move(rhs));
}
}
}
}
// Check for magic method call
if (f && isArrayType(f->getParentType())) {
for (auto &m : magics) {
if (f->getUnmangledName() == m.name && c->numArgs() == m.args) {
seqassertn(m.args == 1 || m.args == 2,
"unexpected number of arguments (magic)");
std::vector<Value *> args(c->begin(), c->end());
auto op = m.op;
auto lhs = parse(args[0], leaves, T);
if (!lhs)
return {};
if (m.args == 1)
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs));
auto rhs = parse(args[1], leaves, T);
if (!rhs)
return {};
return m.right ? std::make_unique<NumPyExpr>(type, v, op, std::move(rhs),
std::move(lhs))
: std::make_unique<NumPyExpr>(type, v, op, std::move(lhs),
std::move(rhs));
}
}
}
}
// Check for right-hand-side magic method call
// Right-hand-side magics (e.g. __radd__) are compiled into FlowInstr:
// <lhs_expr> + <rhs_expr>
// becomes:
// { v1 = <lhs expr> ; v2 = <rhs expr> ; return rhs_class.__radd__(v2, v1) }
// So we need to check for this to detect r-magics.
if (auto *flow = cast<FlowInstr>(v)) {
auto *series = cast<SeriesFlow>(flow->getFlow());
auto *value = cast<CallInstr>(flow->getValue());
auto *f = value ? util::getFunc(value->getCallee()) : nullptr;
if (series && f && value->numArgs() == 2) {
std::vector<Value *> assignments(series->begin(), series->end());
auto *arg1 = value->front();
auto *arg2 = value->back();
auto *vv1 = cast<VarValue>(arg1);
auto *vv2 = cast<VarValue>(arg2);
auto *arg1Var = vv1 ? vv1->getVar() : nullptr;
auto *arg2Var = vv2 ? vv2->getVar() : nullptr;
for (auto &m : magics) {
if (f->getUnmangledName() == m.name && value->numArgs() == m.args && m.right) {
auto op = m.op;
if (assignments.size() == 0) {
// Case 1: Degenerate flow instruction
return parse(value, leaves, T);
} else if (assignments.size() == 1) {
// Case 2: One var -- check if it's either of the r-magic operands
auto *a1 = cast<AssignInstr>(assignments.front());
if (a1 && a1->getLhs() == arg1Var) {
auto rhs = parse(a1->getRhs(), leaves, T);
if (!rhs)
return {};
auto lhs = parse(arg2, leaves, T);
if (!lhs)
return {};
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs),
std::move(rhs));
} else if (a1 && a1->getLhs() == arg2Var) {
auto lhs = parse(a1->getRhs(), leaves, T);
if (!lhs)
return {};
auto rhs = parse(arg1, leaves, T);
if (!rhs)
return {};
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs),
std::move(rhs));
}
} else if (assignments.size() == 2) {
// Case 3: Two vars -- check both permutations
auto *a1 = cast<AssignInstr>(assignments.front());
auto *a2 = cast<AssignInstr>(assignments.back());
if (a1 && a2 && a1->getLhs() == arg1Var && a2->getLhs() == arg2Var) {
auto rhs = parse(a1->getRhs(), leaves, T);
if (!rhs)
return {};
auto lhs = parse(a2->getRhs(), leaves, T);
if (!lhs)
return {};
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs),
std::move(rhs));
} else if (a1 && a2 && a2->getLhs() == arg1Var && a1->getLhs() == arg2Var) {
auto lhs = parse(a1->getRhs(), leaves, T);
if (!lhs)
return {};
auto rhs = parse(a2->getRhs(), leaves, T);
if (!rhs)
return {};
return std::make_unique<NumPyExpr>(type, v, op, std::move(lhs),
std::move(rhs));
}
}
break;
}
}
}
}
auto res = std::make_unique<NumPyExpr>(type, v);
leaves.emplace_back(res.get(), v);
return std::move(res);
}
namespace {
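// Optimizes a parsed expression tree in place: transpose and matmul nodes are
// first evaluated into temporaries, then sub-expressions are fused (statically
// or behind a runtime broadcast check) based on cost, and the remainder is
// evaluated sequentially; returns the variable holding the final result.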
Var *optimizeHelper(NumPyOptimizationUnit &unit, NumPyExpr *expr, CodegenContext &C) {
auto *M = unit.value->getModule();
auto *series = C.series;
// Handle operations that cannot easily be done element-wise (transpose, matmul)
// by optimizing them separately and recursively, replacing each with a
// precomputed temporary.
expr->apply([&](NumPyExpr &e) {
if (!e.type.isArray())
return;
if (e.op == NumPyExpr::NP_OP_TRANSPOSE) {
auto *lv = optimizeHelper(unit, e.lhs.get(), C);
auto *transposeFunc =
M->getOrRealizeFunc("_transpose", {lv->getType()}, {}, FUSION_MODULE);
seqassertn(transposeFunc, "transpose func not found");
auto *var = util::makeVar(util::call(transposeFunc, {M->Nr<VarValue>(lv)}),
C.series, C.func);
C.vars[&e] = var;
NumPyExpr replacement(e.type, M->Nr<VarValue>(var));
replacement.freeable = e.lhs->freeable;
e.replace(replacement);
}
if (e.op == NumPyExpr::NP_OP_MATMUL) {
auto *lv = optimizeHelper(unit, e.lhs.get(), C);
auto *rv = optimizeHelper(unit, e.rhs.get(), C);
auto *matmulFunc = M->getOrRealizeFunc("_matmul", {lv->getType(), rv->getType()},
{}, FUSION_MODULE);
seqassertn(matmulFunc, "matmul func not found");
auto *var = util::makeVar(
util::call(matmulFunc, {M->Nr<VarValue>(lv), M->Nr<VarValue>(rv)}), C.series,
C.func);
C.vars[&e] = var;
NumPyExpr replacement(e.type, M->Nr<VarValue>(var));
replacement.freeable = true;
e.replace(replacement);
}
});
// Optimize the given expression
bool changed;
do {
changed = false;
expr->apply([&](NumPyExpr &e) {
if (e.depth() <= 2)
return;
auto cost = e.cost();
auto bcinfo = e.getBroadcastInfo();
Var *result = nullptr;
if (cost <= AlwaysFuseCostThreshold ||
(cost <= NeverFuseCostThreshold && bcinfo == BroadcastInfo::NO)) {
// Don't care about broadcasting; just fuse.
XLOG("-> static fuse:\n{}", e.str());
result = e.codegenFusedEval(C);
} else if (cost <= NeverFuseCostThreshold && bcinfo != BroadcastInfo::YES) {
// Check at runtime if we're broadcasting and fuse conditionally.
XLOG("-> conditional fuse:\n{}", e.str());
auto *broadcasts = e.codegenBroadcasts(C);
auto *seqtSeries = M->Nr<SeriesFlow>();
auto *fuseSeries = M->Nr<SeriesFlow>();
auto *branch = M->Nr<IfFlow>(broadcasts, seqtSeries, fuseSeries);
C.series = seqtSeries;
auto *seqtResult = e.codegenSequentialEval(C);
C.series = fuseSeries;
auto *fuseResult = e.codegenFusedEval(C);
seqassertn(seqtResult->getType()->is(fuseResult->getType()),
"types are not the same: {} {}", seqtResult->getType()->getName(),
fuseResult->getType()->getName());
result = M->Nr<Var>(seqtResult->getType(), false);
unit.func->push_back(result);
seqtSeries->push_back(M->Nr<AssignInstr>(result, M->Nr<VarValue>(seqtResult)));
fuseSeries->push_back(M->Nr<AssignInstr>(result, M->Nr<VarValue>(fuseResult)));
C.series = series;
series->push_back(branch);
}
if (result) {
NumPyExpr tmp(e.type, M->Nr<VarValue>(result));
e.replace(tmp);
e.freeable = true;
C.vars[&e] = result;
changed = true;
}
});
} while (changed);
XLOG("-> sequential eval:\n{}", expr->str());
return expr->codegenSequentialEval(C);
}
} // namespace
bool NumPyOptimizationUnit::optimize(NumPyPrimitiveTypes &T) {
if (!expr->type.isArray() || expr->depth() <= 2)
return false;
XLOG("Optimizing expression at {}\n{}", value->getSrcInfo(), expr->str());
auto *M = value->getModule();
auto *series = M->Nr<SeriesFlow>();
CodegenContext C(M, series, func, T);
util::CloneVisitor cv(M);
for (auto &p : leaves) {
auto *var = util::makeVar(cv.clone(p.second), series, func);
C.vars.emplace(p.first, var);
}
auto *result = optimizeHelper(*this, expr.get(), C);
auto *replacement = M->Nr<FlowInstr>(C.series, M->Nr<VarValue>(result));
value->replaceAll(replacement);
return true;
}
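// Collects candidate NumPy expressions from a function body: each assignment or
// standalone value is parsed into a NumPyExpr tree, and trees containing at
// least one array node with depth greater than one are kept as optimization
// units. Sub-values of an extracted expression are not extracted again.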
struct ExtractArrayExpressions : public util::Operator {
BodiedFunc *func;
NumPyPrimitiveTypes types;
std::vector<NumPyOptimizationUnit> exprs;
std::unordered_set<id_t> extracted;
explicit ExtractArrayExpressions(BodiedFunc *func)
: util::Operator(), func(func), types(func->getModule()), exprs(), extracted() {}
void extract(Value *v, AssignInstr *assign = nullptr) {
if (extracted.count(v->getId()))
return;
std::vector<std::pair<NumPyExpr *, Value *>> leaves;
auto expr = parse(v, leaves, types);
if (expr) {
int64_t numArrayNodes = 0;
expr->apply([&](NumPyExpr &e) {
if (e.type.isArray())
++numArrayNodes;
extracted.emplace(e.val->getId());
});
if (numArrayNodes > 0 && expr->depth() > 1) {
exprs.push_back({v, func, std::move(expr), std::move(leaves), assign});
}
}
}
void preHook(Node *n) override {
if (auto *v = cast<AssignInstr>(n)) {
extract(v->getRhs(), v->getLhs()->isGlobal() ? nullptr : v);
} else if (auto *v = cast<Value>(n)) {
extract(v);
}
}
};
const std::string NumPyFusionPass::KEY = "core-numpy-fusion";
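// Pass entry point: extract candidate expressions, consult the reaching-
// definition and side-effect analyses, forward intermediate results between
// expressions where safe, then optimize each root expression and erase the
// assignments that were forwarded away.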
void NumPyFusionPass::visit(BodiedFunc *func) {
ExtractArrayExpressions extractor(func);
func->accept(extractor);
if (extractor.exprs.empty())
return;
auto *rdres = getAnalysisResult<analyze::dataflow::RDResult>(reachingDefKey);
auto it = rdres->results.find(func->getId());
if (it == rdres->results.end())
return;
auto *rd = it->second.get();
auto *se = getAnalysisResult<analyze::module::SideEffectResult>(sideEffectsKey);
auto *cfg = rdres->cfgResult->graphs.find(func->getId())->second.get();
auto fwd = getForwardingDAGs(func, rd, cfg, se, extractor.exprs);
for (auto &dag : fwd) {
std::vector<AssignInstr *> assignsToDelete;
auto *e = doForwarding(dag, assignsToDelete);
if (e->optimize(extractor.types)) {
for (auto *a : assignsToDelete)
a->replaceAll(func->getModule()->Nr<SeriesFlow>());
}
}
}
} // namespace numpy
} // namespace transform
} // namespace ir
} // namespace codon

View File

@ -0,0 +1,313 @@
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once
#include "codon/cir/analyze/dataflow/reaching.h"
#include "codon/cir/analyze/module/global_vars.h"
#include "codon/cir/analyze/module/side_effect.h"
#include "codon/cir/transform/pass.h"
#include "codon/cir/types/types.h"
#include <functional>
#include <memory>
#include <vector>
namespace codon {
namespace ir {
namespace transform {
namespace numpy {
extern const std::string FUSION_MODULE;
/// NumPy operator fusion pass.
class NumPyFusionPass : public OperatorPass {
private:
/// Key of the reaching definition analysis
std::string reachingDefKey;
/// Key of the side effect analysis
std::string sideEffectsKey;
public:
static const std::string KEY;
/// Constructs a NumPy fusion pass.
/// @param reachingDefKey the reaching definition analysis' key
/// @param sideEffectsKey the side effect analysis' key
NumPyFusionPass(const std::string &reachingDefKey, const std::string &sideEffectsKey)
: OperatorPass(), reachingDefKey(reachingDefKey), sideEffectsKey(sideEffectsKey) {
}
std::string getKey() const override { return KEY; }
void visit(BodiedFunc *f) override;
};
struct NumPyPrimitiveTypes {
types::Type *none;
types::Type *optnone;
types::Type *bool_;
types::Type *i8;
types::Type *u8;
types::Type *i16;
types::Type *u16;
types::Type *i32;
types::Type *u32;
types::Type *i64;
types::Type *u64;
types::Type *f16;
types::Type *f32;
types::Type *f64;
types::Type *c64;
types::Type *c128;
explicit NumPyPrimitiveTypes(Module *M);
};
struct NumPyType {
enum Type {
NP_TYPE_NONE = -1,
NP_TYPE_BOOL,
NP_TYPE_I8,
NP_TYPE_U8,
NP_TYPE_I16,
NP_TYPE_U16,
NP_TYPE_I32,
NP_TYPE_U32,
NP_TYPE_I64,
NP_TYPE_U64,
NP_TYPE_F16,
NP_TYPE_F32,
NP_TYPE_F64,
NP_TYPE_C64,
NP_TYPE_C128,
NP_TYPE_SCALAR_END, // separator value
NP_TYPE_ARR_BOOL,
NP_TYPE_ARR_I8,
NP_TYPE_ARR_U8,
NP_TYPE_ARR_I16,
NP_TYPE_ARR_U16,
NP_TYPE_ARR_I32,
NP_TYPE_ARR_U32,
NP_TYPE_ARR_I64,
NP_TYPE_ARR_U64,
NP_TYPE_ARR_F16,
NP_TYPE_ARR_F32,
NP_TYPE_ARR_F64,
NP_TYPE_ARR_C64,
NP_TYPE_ARR_C128,
} dtype;
int64_t ndim;
NumPyType(Type dtype, int64_t ndim = 0);
NumPyType();
static NumPyType get(types::Type *t, NumPyPrimitiveTypes &T);
types::Type *getIRBaseType(NumPyPrimitiveTypes &T) const;
operator bool() const { return dtype != NP_TYPE_NONE; }
bool isArray() const { return dtype > NP_TYPE_SCALAR_END; }
friend std::ostream &operator<<(std::ostream &os, NumPyType const &type);
std::string str() const;
};
struct NumPyExpr;
struct CodegenContext {
Module *M;
SeriesFlow *series;
BodiedFunc *func;
std::unordered_map<NumPyExpr *, Var *> vars;
NumPyPrimitiveTypes &T;
CodegenContext(Module *M, SeriesFlow *series, BodiedFunc *func,
NumPyPrimitiveTypes &T);
};
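/// Broadcasting classification returned by NumPyExpr::getBroadcastInfo().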
enum BroadcastInfo {
UNKNOWN,
YES,
NO,
MAYBE,
};
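/// A parsed NumPy expression tree: leaves wrap IR values (arrays or scalars)
/// and interior nodes represent element-wise or special (matmul, transpose)
/// operations.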
struct NumPyExpr {
NumPyType type;
Value *val;
enum Op {
NP_OP_NONE,
NP_OP_POS,
NP_OP_NEG,
NP_OP_INVERT,
NP_OP_ABS,
NP_OP_TRANSPOSE,
NP_OP_ADD,
NP_OP_SUB,
NP_OP_MUL,
NP_OP_MATMUL,
NP_OP_TRUE_DIV,
NP_OP_FLOOR_DIV,
NP_OP_MOD,
NP_OP_FMOD,
NP_OP_POW,
NP_OP_LSHIFT,
NP_OP_RSHIFT,
NP_OP_AND,
NP_OP_OR,
NP_OP_XOR,
NP_OP_LOGICAL_AND,
NP_OP_LOGICAL_OR,
NP_OP_LOGICAL_XOR,
NP_OP_EQ,
NP_OP_NE,
NP_OP_LT,
NP_OP_LE,
NP_OP_GT,
NP_OP_GE,
NP_OP_MIN,
NP_OP_MAX,
NP_OP_FMIN,
NP_OP_FMAX,
NP_OP_SIN,
NP_OP_COS,
NP_OP_TAN,
NP_OP_ARCSIN,
NP_OP_ARCCOS,
NP_OP_ARCTAN,
NP_OP_ARCTAN2,
NP_OP_HYPOT,
NP_OP_SINH,
NP_OP_COSH,
NP_OP_TANH,
NP_OP_ARCSINH,
NP_OP_ARCCOSH,
NP_OP_ARCTANH,
NP_OP_CONJ,
NP_OP_EXP,
NP_OP_EXP2,
NP_OP_LOG,
NP_OP_LOG2,
NP_OP_LOG10,
NP_OP_EXPM1,
NP_OP_LOG1P,
NP_OP_SQRT,
NP_OP_SQUARE,
NP_OP_CBRT,
NP_OP_LOGADDEXP,
NP_OP_LOGADDEXP2,
NP_OP_RECIPROCAL,
NP_OP_RINT,
NP_OP_FLOOR,
NP_OP_CEIL,
NP_OP_TRUNC,
NP_OP_ISNAN,
NP_OP_ISINF,
NP_OP_ISFINITE,
NP_OP_SIGN,
NP_OP_SIGNBIT,
NP_OP_COPYSIGN,
NP_OP_SPACING,
NP_OP_NEXTAFTER,
NP_OP_DEG2RAD,
NP_OP_RAD2DEG,
NP_OP_HEAVISIDE,
} op;
std::unique_ptr<NumPyExpr> lhs;
std::unique_ptr<NumPyExpr> rhs;
bool freeable;
NumPyExpr(NumPyType type, Value *val)
: type(std::move(type)), val(val), op(NP_OP_NONE), lhs(), rhs(), freeable(false) {
}
NumPyExpr(NumPyType type, Value *val, NumPyExpr::Op op,
std::unique_ptr<NumPyExpr> lhs)
: type(std::move(type)), val(val), op(op), lhs(std::move(lhs)), rhs(),
freeable(false) {}
NumPyExpr(NumPyType type, Value *val, NumPyExpr::Op op,
std::unique_ptr<NumPyExpr> lhs, std::unique_ptr<NumPyExpr> rhs)
: type(std::move(type)), val(val), op(op), lhs(std::move(lhs)),
rhs(std::move(rhs)), freeable(false) {}
static std::unique_ptr<NumPyExpr>
parse(Value *v, std::vector<std::pair<NumPyExpr *, Value *>> &leaves,
NumPyPrimitiveTypes &T);
void replace(NumPyExpr &e);
bool haveVectorizedLoop() const;
int64_t opcost() const;
int64_t cost() const;
std::string opstring() const;
void dump(std::ostream &os, int level, int &leafId) const;
friend std::ostream &operator<<(std::ostream &os, NumPyExpr const &expr);
std::string str() const;
bool isLeaf() const { return !lhs && !rhs; }
int depth() const {
return std::max(lhs ? lhs->depth() : 0, rhs ? rhs->depth() : 0) + 1;
}
int nodes() const { return (lhs ? lhs->nodes() : 0) + (rhs ? rhs->nodes() : 0) + 1; }
void apply(std::function<void(NumPyExpr &)> f);
Value *codegenBroadcasts(CodegenContext &C);
Var *codegenFusedEval(CodegenContext &C);
Var *codegenSequentialEval(CodegenContext &C);
BroadcastInfo getBroadcastInfo();
Value *codegenScalarExpr(CodegenContext &C,
const std::unordered_map<NumPyExpr *, Var *> &args,
const std::unordered_map<NumPyExpr *, unsigned> &scalarMap,
Var *scalars);
};
std::unique_ptr<NumPyExpr> parse(Value *v,
std::vector<std::pair<NumPyExpr *, Value *>> &leaves,
NumPyPrimitiveTypes &T);
struct NumPyOptimizationUnit {
/// Original IR value corresponding to this expression
Value *value;
/// Function in which the value exists
BodiedFunc *func;
/// Root expression
std::unique_ptr<NumPyExpr> expr;
/// Leaves ordered by execution in original expression
std::vector<std::pair<NumPyExpr *, Value *>> leaves;
/// AssignInstr whose right-hand side is this expression, or null if none
AssignInstr *assign;
bool optimize(NumPyPrimitiveTypes &T);
};
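/// Records that the expression of 'src' (assigned to 'var') can replace the
/// leaf 'dstLeaf' of 'dst'; 'dstId' and 'srcId' index the original expression
/// list.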
struct Forwarding {
NumPyOptimizationUnit *dst;
NumPyOptimizationUnit *src;
Var *var;
NumPyExpr *dstLeaf;
int64_t dstId;
int64_t srcId;
};
using ForwardingDAG =
std::unordered_map<NumPyOptimizationUnit *, std::vector<Forwarding>>;
NumPyOptimizationUnit *doForwarding(ForwardingDAG &dag,
std::vector<AssignInstr *> &assignsToDelete);
std::vector<ForwardingDAG> getForwardingDAGs(BodiedFunc *func,
analyze::dataflow::RDInspector *rd,
analyze::dataflow::CFGraph *cfg,
analyze::module::SideEffectResult *se,
std::vector<NumPyOptimizationUnit> &exprs);
} // namespace numpy
} // namespace transform
} // namespace ir
} // namespace codon

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "openmp.h"
@ -402,7 +402,8 @@ struct ReductionIdentifier : public util::Operator {
static void extractAssociativeOpChain(Value *v, const std::string &op,
types::Type *type,
std::vector<Value *> &result) {
if (util::isCallOf(v, op, {type, type}, type, /*method=*/true)) {
if (util::isCallOf(v, op, {type, nullptr}, type, /*method=*/true) ||
util::isCallOf(v, op, {nullptr, type}, type, /*method=*/true)) {
auto *call = cast<CallInstr>(v);
extractAssociativeOpChain(call->front(), op, type, result);
extractAssociativeOpChain(call->back(), op, type, result);
@ -450,7 +451,8 @@ struct ReductionIdentifier : public util::Operator {
for (auto &rf : reductionFunctions) {
if (rf.method) {
if (!util::isCallOf(item, rf.name, {type, type}, type, /*method=*/true))
if (!(util::isCallOf(item, rf.name, {type, nullptr}, type, /*method=*/true) ||
util::isCallOf(item, rf.name, {nullptr, type}, type, /*method=*/true)))
continue;
} else {
if (!util::isCallOf(item, rf.name,
@ -464,8 +466,7 @@ struct ReductionIdentifier : public util::Operator {
if (rf.method) {
std::vector<Value *> opChain;
extractAssociativeOpChain(callRHS, rf.name, callRHS->front()->getType(),
opChain);
extractAssociativeOpChain(callRHS, rf.name, type, opChain);
if (opChain.size() < 2)
continue;
@ -640,10 +641,11 @@ struct ParallelLoopTemplateReplacer : public LoopTemplateReplacer {
auto *series = M->Nr<SeriesFlow>();
auto *tupleVal = util::makeVar(reductionTuple, series, parent);
auto *reduceCode = util::call(
reduceNoWait, {M->Nr<VarValue>(reductionLocRef), M->Nr<VarValue>(gtid),
tupleVal, rawReducer, M->Nr<PointerValue>(lck)});
auto *codeVar = util::makeVar(reduceCode, series, parent)->getVar();
auto *reduceCode =
util::call(reduceNoWait,
{M->Nr<VarValue>(reductionLocRef), M->Nr<VarValue>(gtid),
M->Nr<VarValue>(tupleVal), rawReducer, M->Nr<PointerValue>(lck)});
auto *codeVar = util::makeVar(reduceCode, series, parent);
seqassertn(codeVar->getType()->is(M->getIntType()), "wrong reduce code type");
auto *sectionNonAtomic = M->Nr<SeriesFlow>();
@ -740,11 +742,11 @@ struct ImperativeLoopTemplateReplacer : public ParallelLoopTemplateReplacer {
"unknown reduction init value");
}
VarValue *newVar = util::makeVar(
initVal, cast<SeriesFlow>(parent->getBody()), parent, /*prepend=*/true);
sharedInfo.push_back({next, newVar->getVar(), reduction});
auto *newVar = util::makeVar(initVal, cast<SeriesFlow>(parent->getBody()),
parent, /*prepend=*/true);
sharedInfo.push_back({next, newVar, reduction});
newArg = M->Nr<PointerValue>(newVar->getVar());
newArg = M->Nr<PointerValue>(newVar);
++next;
} else {
newArg = util::tupleGet(M->Nr<VarValue>(extras), next++);
@ -918,9 +920,9 @@ struct TaskLoopRoutineStubReplacer : public ParallelLoopTemplateReplacer {
for (auto *val : shareds) {
if (getVarFromOutlinedArg(val)->getId() != loopVar->getId()) {
if (auto &reduction = sharedRedux[sharedsNext]) {
Var *newVar = util::getVar(util::makeVar(
reduction.getInitial(), cast<SeriesFlow>(parent->getBody()), parent,
/*prepend=*/true));
auto *newVar = util::makeVar(reduction.getInitial(),
cast<SeriesFlow>(parent->getBody()), parent,
/*prepend=*/true);
sharedInfo.push_back({sharedsNext, newVar, reduction});
}
}
@ -1050,7 +1052,7 @@ struct TaskLoopRoutineStubReplacer : public ParallelLoopTemplateReplacer {
seqassertn(irArrayType, "could not find 'TaskReductionInputArray' type");
auto *taskRedInputsArray = util::makeVar(
M->Nr<StackAllocInstr>(irArrayType, numRed), taskRedInitSeries, parent);
array = util::getVar(taskRedInputsArray);
array = taskRedInputsArray;
auto *taskRedInputsArrayType = taskRedInputsArray->getType();
auto *taskRedSetItem = M->getOrRealizeMethod(
@ -1081,7 +1083,7 @@ struct TaskLoopRoutineStubReplacer : public ParallelLoopTemplateReplacer {
M->Nr<VarValue>(gtid),
M->getInt(numRed), arrayPtr}),
taskRedInitSeries, parent);
tskgrp = util::getVar(taskRedInitResult);
tskgrp = taskRedInitResult;
v->replaceAll(taskRedInitSeries);
}
@ -1345,14 +1347,13 @@ CollapseResult collapseLoop(BodiedFunc *parent, ImperativeForFlow *v, int64_t le
for (auto *loop : loopNests) {
LoopRange range;
range.loop = loop;
range.start = util::makeVar(loop->getStart(), setup, parent)->getVar();
range.stop = util::makeVar(loop->getEnd(), setup, parent)->getVar();
range.start = util::makeVar(loop->getStart(), setup, parent);
range.stop = util::makeVar(loop->getEnd(), setup, parent);
range.step = loop->getStep();
range.len = util::makeVar(util::call(lenCalc, {M->Nr<VarValue>(range.start),
M->Nr<VarValue>(range.stop),
M->getInt(range.step)}),
setup, parent)
->getVar();
range.len = util::makeVar(
util::call(lenCalc, {M->Nr<VarValue>(range.start), M->Nr<VarValue>(range.stop),
M->getInt(range.step)}),
setup, parent);
ranges.push_back(range);
}
@ -1374,11 +1375,9 @@ CollapseResult collapseLoop(BodiedFunc *parent, ImperativeForFlow *v, int64_t le
for (auto it = ranges.rbegin(); it != ranges.rend(); ++it) {
auto *k = lastDiv ? lastDiv : collapsedVar;
auto *div =
util::makeVar(*M->Nr<VarValue>(k) / *M->Nr<VarValue>(it->len), body, parent)
->getVar();
util::makeVar(*M->Nr<VarValue>(k) / *M->Nr<VarValue>(it->len), body, parent);
auto *mod =
util::makeVar(*M->Nr<VarValue>(k) % *M->Nr<VarValue>(it->len), body, parent)
->getVar();
util::makeVar(*M->Nr<VarValue>(k) % *M->Nr<VarValue>(it->len), body, parent);
auto *i =
*M->Nr<VarValue>(it->start) + *(*M->Nr<VarValue>(mod) * *M->getInt(it->step));
body->push_back(M->Nr<AssignInstr>(it->loop->getVar(), i));

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "schedule.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "pass.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "dict.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "generator.h"
@ -150,7 +150,7 @@ Func *genToSum(BodiedFunc *gen, types::Type *startType, types::Type *outType) {
if (!init || !init->getType()->is(outType))
return nullptr;
auto *accumulator = util::makeVar(init, body, fn, /*prepend=*/true)->getVar();
auto *accumulator = util::makeVar(init, body, fn, /*prepend=*/true);
GeneratorSumTransformer xgen(accumulator);
fn->accept(xgen);
body->push_back(M->Nr<ReturnInstr>(M->Nr<VarValue>(accumulator)));

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "io.h"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "list.h"
@ -45,7 +45,7 @@ struct ElementHandler {
void doSetup(const std::vector<Value *> &values, SeriesFlow *block,
BodiedFunc *parent) {
for (auto *v : values) {
vars.push_back(util::makeVar(v, block, parent)->getVar());
vars.push_back(util::makeVar(v, block, parent));
}
}
@ -226,7 +226,7 @@ Value *optimize(BodiedFunc *parent, InspectionResult &r) {
}
auto *opt = M->Nr<SeriesFlow>();
auto *len = util::makeVar(M->getInt(0), opt, parent)->getVar();
auto *len = util::makeVar(M->getInt(0), opt, parent);
for (auto &h : handlers) {
h->setup(opt, parent);
@ -238,8 +238,7 @@ Value *optimize(BodiedFunc *parent, InspectionResult &r) {
auto *fn = M->getOrRealizeMethod(ty, "_list_add_opt_opt_new", {M->getIntType()});
seqassertn(fn, "could not find list new helper");
auto *result =
util::makeVar(util::call(fn, {M->Nr<VarValue>(len)}), opt, parent)->getVar();
auto *result = util::makeVar(util::call(fn, {M->Nr<VarValue>(len)}), opt, parent);
for (auto &h : handlers) {
opt->push_back(h->append(M->Nr<VarValue>(result)));

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#pragma once

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022-2024 Exaloop Inc. <https://exaloop.io>
// Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
#include "str.h"

Some files were not shown because too many files have changed in this diff.