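fix examples: import from mlir.extras.dialects (drop .ext), use `=` defaults for kernel type parameters, re-enable module prints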

makslevental · makslevental · commit b6a718d608df · 2025-11-24T18:22:24.000-08:00
diff --git a/projects/eudsl-python-extras/examples/cuda_e2e.ipynb b/projects/eudsl-python-extras/examples/cuda_e2e.ipynb
@@ -103,10 +103,10 @@
     "from mlir import _mlir_libs\n",
     "from mlir.extras.ast.canonicalize import canonicalize\n",
     "from mlir.extras.context import RAIIMLIRContext, ExplicitlyManagedModule\n",
-    "from mlir.extras.dialects.ext import arith, memref, scf, gpu\n",
-    "from mlir.extras.dialects.ext import linalg\n",
-    "from mlir.extras.dialects.ext import transform\n",
-    "from mlir.extras.dialects.ext.func import func\n",
+    "from mlir.extras.dialects import arith, memref, scf, gpu\n",
+    "from mlir.extras.dialects import linalg\n",
+    "from mlir.extras.dialects import transform\n",
+    "from mlir.extras.dialects.func import func\n",
     "from mlir.extras.runtime.passes import Pipeline, run_pipeline\n",
     "from mlir.extras.runtime.refbackend import LLVMJITBackend\n",
     "from mlir.extras.util import find_ops\n",
diff --git a/projects/eudsl-python-extras/examples/cuda_matmul_opt.py b/projects/eudsl-python-extras/examples/cuda_matmul_opt.py
@@ -11,17 +11,17 @@
     mlir_mod_ctx,
     MLIRContext,
 )
-from mlir.extras.dialects.ext import arith, memref, gpu, scf, linalg, vector, nvgpu
-from mlir.extras.dialects.ext.gpu import (
+from mlir.extras.dialects import arith, memref, gpu, scf, linalg, vector, nvgpu
+from mlir.extras.dialects.gpu import (
     block_idx,
     thread_idx,
     block_dim,
     get_compile_object_bytes,
     smem_space,
 )
-from mlir.extras.dialects.ext.llvm import llvm_ptr_t
-from mlir.extras.dialects.ext.memref import S
-from mlir.extras.dialects.ext.scf import range_
+from mlir.extras.dialects.llvm import llvm_ptr_t
+from mlir.extras.dialects.memref import S
+from mlir.extras.dialects.scf import range_
 from mlir.extras.runtime.passes import Pipeline, run_pipeline
 
 # noinspection PyUnresolvedReferences
@@ -139,9 +139,9 @@ def sgemm_naive[
     K,
     N,
     dtype,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     one = arith.constant(1.0, type=dtype)
     tmp = arith.constant(0, type=dtype)
@@ -167,9 +167,9 @@ def sgemm_naive_row_order[
     K,
     N,
     dtype,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     one = arith.constant(1.0, type=dtype)
     tmp = arith.constant(0, type=dtype)
@@ -193,10 +193,10 @@ def sgemm_coalesce[
     K,
     N,
     dtype,
-    BLOCK_SIZE: 32,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    BLOCK_SIZE = 32,
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
 
     tid = gpu.thread_id()
@@ -259,10 +259,10 @@ def sgemm_coalesce_transpose_B[
     K,
     N,
     dtype,
-    BLOCK_SIZE: 32,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    BLOCK_SIZE = 32,
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
 
     tid = gpu.thread_id()
@@ -288,10 +288,10 @@ def sgemm_shared_mem_block[
     K,
     N,
     dtype,
-    BLOCK_SIZE: 32,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    BLOCK_SIZE = 32,
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     # allocate buffer for current block in fast shared mem
     # shared mem is shared between all threads in a block
@@ -394,9 +394,9 @@ def sgemm_shared_mem_1d_block_tiling[
     BN,
     BK,
     TM,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     base = gpu.dynamic_shared_memory()
     A_shared = memref.view(base, (BM, BK), dtype=dtype)
@@ -455,9 +455,9 @@ def sgemm_shared_mem_2d_block_tiling[
     BK,
     TM,
     TN,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     base = gpu.dynamic_shared_memory()
     A_shared = memref.view(base, (BM, BK), dtype=dtype)
@@ -542,9 +542,9 @@ def sgemm_shared_mem_2d_block_tiling_vectorize[
     BK,
     TM,
     TN,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     VECTOR_WIDTH = 4
     DTYPE_WIDTH = dtype.width // 8
@@ -656,9 +656,9 @@ def sgemm_warp_tiling[
     TM,
     TN,
     NUM_THREADS,
-    A_t: T.memref(M, K, dtype),
-    B_t: T.memref(K, N, dtype),
-    C_t: T.memref(M, N, dtype),
+    A_t = T.memref(M, K, dtype),
+    B_t = T.memref(K, N, dtype),
+    C_t = T.memref(M, N, dtype),
 ](A: A_t, B: B_t, C: C_t):
     VECTOR_WIDTH = 4
     DTYPE_WIDTH = dtype.width // 8
@@ -820,11 +820,11 @@ def sgemm_tensor_core[
     M,
     K,
     N,
-    A_t: T.memref(M, K, T.f16()),
-    B_t: T.memref(K, N, T.f16()),
-    C_t: T.memref(M, N, T.f32()),
-    a_tma_t: llvm_ptr_t(),
-    b_tma_t: llvm_ptr_t(),
+    A_t = T.memref(M, K, T.f16()),
+    B_t = T.memref(K, N, T.f16()),
+    C_t = T.memref(M, N, T.f32()),
+    a_tma_t = llvm_ptr_t(),
+    b_tma_t = llvm_ptr_t(),
 ](A: A_t, B: B_t, C: C_t, a_tma: a_tma_t, b_tma: b_tma_t):
     a_tma = builtin.unrealized_conversion_cast(
         [
@@ -987,7 +987,7 @@ def prepare_warp_tiled_kernel(ctx: MLIRContext, kernel, M, K, N):
     def matmul_mod():
         kernel[M, K, N, dtype, BM, BN, BK, WM, WN, WNITER, TM, TN, NUM_THREADS].emit()
 
-    # print(ctx.module)
+    print(ctx.module)
     assert ctx.module.operation.verify()
 
     if cuda_bindings_not_installed():
diff --git a/projects/eudsl-python-extras/examples/flash_attention.py b/projects/eudsl-python-extras/examples/flash_attention.py
@@ -6,11 +6,11 @@
 
 from mlir.extras.ast.canonicalize import canonicalize
 from mlir.extras.context import RAIIMLIRContextModule
-from mlir.extras.dialects.ext import memref, scf, arith, gpu, llvm
+from mlir.extras.dialects import memref, scf, arith, gpu, llvm
 from mlir.dialects import math
 
 # noinspection PyUnresolvedReferences
-from mlir.extras.dialects.ext.gpu import (
+from mlir.extras.dialects.gpu import (
     block_idx,
     thread_idx,
     grid_dim,
@@ -222,7 +222,7 @@ def flash_attention(
 ip.__exit__(None, None, None)
 
 assert gpu_module.operation.verify()
-# print(gpu_module)
+print(gpu_module)
 
 sram_size = 4 * Bc * d * np.float32().itemsize
 
diff --git a/projects/eudsl-python-extras/examples/mlir_python_extras.ipynb b/projects/eudsl-python-extras/examples/mlir_python_extras.ipynb
@@ -64,17 +64,17 @@
     "import mlir.extras.types as T\n",
     "from mlir.extras.ast.canonicalize import canonicalize\n",
     "from mlir.extras.context import mlir_mod_ctx\n",
-    "from mlir.extras.dialects.ext.arith import constant\n",
-    "from mlir.extras.dialects.ext.memref import S\n",
-    "from mlir.extras.dialects.ext.func import func\n",
-    "from mlir.extras.dialects.ext.scf import canonicalizer as scf, range_\n",
+    "from mlir.extras.dialects.arith import constant\n",
+    "from mlir.extras.dialects.memref import S\n",
+    "from mlir.extras.dialects.func import func\n",
+    "from mlir.extras.dialects.scf import canonicalizer as scf, range_\n",
     "from mlir.extras.runtime.passes import Pipeline, run_pipeline\n",
     "from mlir.extras.runtime.refbackend import LLVMJITBackend\n",
     "from mlir.ir import StridedLayoutAttr\n",
     "\n",
     "# you need this to register the memref value caster\n",
     "# noinspection PyUnresolvedReferences\n",
-    "import mlir.extras.dialects.ext.memref\n",
+    "import mlir.extras.dialects.memref\n",
     "\n",
     "ctx_man = mlir_mod_ctx()\n",
     "ctx = ctx_man.__enter__()\n",
@@ -417,7 +417,7 @@
     "layout = StridedLayoutAttr.get(S, (K, 1))\n",
     "ranked_memref_dxd_f32 = T.memref(D, D, T.f32(), layout=layout)\n",
     "\n",
-    "from mlir.extras.dialects.ext import linalg\n",
+    "from mlir.extras.dialects import linalg\n",
     "\n",
     "@func(emit=True)\n",
     "@canonicalize(using=scf)\n",
diff --git a/projects/eudsl-python-extras/examples/mwe.py b/projects/eudsl-python-extras/examples/mwe.py
@@ -11,7 +11,7 @@
 
 # you need this to register the memref value caster
 # noinspection PyUnresolvedReferences
-import mlir.extras.dialects.ext.memref
+import mlir.extras.dialects.memref
 from mlir.extras.context import RAIIMLIRContext, ExplicitlyManagedModule
 from mlir.dialects.bufferization import LayoutMapOption
 from mlir.dialects.transform.vector import (
@@ -20,15 +20,15 @@
     VectorTransferSplit,
     VectorTransposeLowering,
 )
-from mlir.extras.dialects.ext import linalg
-from mlir.extras.dialects.ext.func import func
-from mlir.extras.dialects.ext.transform import (
+from mlir.extras.dialects import linalg
+from mlir.extras.dialects.func import func
+from mlir.extras.dialects.transform import (
     match,
     tile_to_scf_for,
     get_parent_op,
     transform_any_op_t,
 )
-from mlir.extras.dialects.ext import transform
+from mlir.extras.dialects import transform
 from mlir.extras.runtime.passes import Pipeline, run_pipeline
 from mlir.extras.runtime.refbackend import LLVMJITBackend
 
diff --git a/projects/eudsl-python-extras/examples/rdna_matmul_opt.py b/projects/eudsl-python-extras/examples/rdna_matmul_opt.py
@@ -2,13 +2,13 @@
 
 from mlir.extras.ast.canonicalize import canonicalize
 from mlir.extras.context import RAIIMLIRContextModule
-from mlir.extras.dialects.ext import memref, scf, arith, gpu, llvm
+from mlir.extras.dialects import memref, scf, arith, gpu, llvm
 from mlir.dialects import index as index_dialect
 from mlir.ir import InsertionPoint, IntegerAttr, UnitAttr, Attribute
 import mlir.extras.types as T
 
 # noinspection PyUnresolvedReferences
-from mlir.extras.dialects.ext.gpu import (
+from mlir.extras.dialects.gpu import (
     all_reduce,
     wait,
     thread_attr as thread,
@@ -721,7 +721,7 @@ def kernel5_lds_optim(
 )
 
 assert simplified_module.operation.verify()
-# print(simplified_module)
+print(simplified_module)
 
 lowered_module = run_pipeline(
     simplified_module,
diff --git a/projects/eudsl-python-extras/examples/vectorization_e2e.ipynb b/projects/eudsl-python-extras/examples/vectorization_e2e.ipynb
@@ -71,7 +71,7 @@
     "\n",
     "# you need this to register the memref value caster\n",
     "# noinspection PyUnresolvedReferences\n",
-    "import mlir.extras.dialects.ext.memref\n",
+    "import mlir.extras.dialects.memref\n",
     "from mlir.extras.context import RAIIMLIRContext, ExplicitlyManagedModule\n",
     "from mlir.dialects.bufferization import LayoutMapOption\n",
     "from mlir.dialects.transform.vector import (\n",
@@ -80,15 +80,15 @@
     "    VectorTransferSplit,\n",
     "    VectorTransposeLowering,\n",
     ")\n",
-    "from mlir.extras.dialects.ext import linalg\n",
-    "from mlir.extras.dialects.ext.func import func\n",
-    "from mlir.extras.dialects.ext.transform import (\n",
+    "from mlir.extras.dialects import linalg\n",
+    "from mlir.extras.dialects.func import func\n",
+    "from mlir.extras.dialects.transform import (\n",
     "    match,\n",
     "    tile_to_scf_for,\n",
     "    get_parent_op,\n",
     "    transform_any_op_t,\n",
     ")\n",
-    "from mlir.extras.dialects.ext import transform\n",
+    "from mlir.extras.dialects import transform\n",
     "from mlir.extras.runtime.passes import Pipeline, run_pipeline\n",
     "from mlir.extras.runtime.refbackend import LLVMJITBackend\n"
    ],