uxlfoundation · rjoursler · Dec 3, 2025 · Nov 21, 2025 · Nov 21, 2025 · Nov 21, 2025
@@ -116,7 +116,7 @@ def get_file_guard(path):
     if path.startswith("src/gpu/intel/gemm/jit/"):
         base = os.path.basename(path)
         if path != "src/gpu/intel/gemm/jit/" + base:
-            path = "src/gemmstone_guard/" + os.path.basename(path)
+            path = path.replace("gpu/intel/gemm/jit", "gemmstone")
     elif path.startswith("src/gpu/intel/microkernels"):
         path = path.replace("intel/", "")
     guard = path

@@ -19,7 +19,7 @@
 
 #include "common/c_types_map.hpp"
 #include "common/reorder.hpp"
-#include "gpu/intel/binary/config.hpp"
+#include "gpu/gpu_binary_pd.hpp"
 #include "gpu/intel/primitive.hpp"
 
 namespace dnnl {
@@ -28,6 +28,8 @@ namespace gpu {
 namespace intel {
 namespace binary {
 
+using pd_t = gpu_binary_pd_t;
+
 struct multi_po_reorder_t : public primitive_t {
     using primitive_t::primitive_t;
     struct pd_t : public binary::pd_t {

@@ -90,6 +90,23 @@ block_layout_t::block_layout_t(
     if (do_normalize) *this = normalized();
 }
 
+block_layout_t get_inner_layout(const memory_desc_wrapper &md) {
+    block_layout_t inner_layout(md, /* inner_only */ true);
+
+    block_layout_t ret; // receives one block per dimension index below
+    // Seed MAX_NDIMS size-1 placeholder blocks, one per dimension index d.
+    for (int d = 0; d < MAX_NDIMS; d++) {
+        ret.append(block_t(d, 1, 0));
+    }
+
+    // Store each inner block at its dim_idx; a repeated dim_idx keeps the last.
+    for (const auto &block : inner_layout) {
+        ret[block.dim_idx] = block; // NOTE(review): confirm overwrite is intended
+    }
+
+    return ret;
+}
+
 } // namespace intel
 } // namespace gpu
 } // namespace impl

@@ -188,6 +188,8 @@ struct block_layout_t {
 std::vector<block_t> normalize_blocks(
         const std::vector<block_t> &blocks, bool remove_size_1_blocks = true);
 
+block_layout_t get_inner_layout(const memory_desc_wrapper &md);
+
 } // namespace intel
 } // namespace gpu
 } // namespace impl

@@ -64,6 +64,11 @@ uint64_t get_future_extensions(
     return extensions;
 }
 
+ngen::HW device_info_t::ngen_hw() const { // nGEN core for this device
+    ngen::Product p = jit::get_ngen_product(*this);
+    return ngen::getCore(p.family); // derived from the product family only
+}
+
 int device_info_t::stepping_id() const {
     ngen::Product p = jit::get_ngen_product(*this);
     return p.stepping;

@@ -30,6 +30,11 @@
 
 #include "oneapi/dnnl/dnnl_config.h"
 
+namespace ngen {
+enum class Core; // opaque decl (underlying int); must agree with nGEN's enum
+using HW = Core; // alias used by device_info_t::ngen_hw() below
+} // namespace ngen
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
@@ -190,6 +195,7 @@ struct device_info_t {
     bool has_native(native_ext_t ext) const { return native_extensions_ & (uint64_t)ext; } // true iff any bit of ext is set in native_extensions_
     gpu_arch_t gpu_arch() const { return gpu_arch_; }
     const gpu_product_t &gpu_product() const {return gpu_product_;}
+    ngen::HW ngen_hw() const;
     int stepping_id() const;
     uint64_t native_extensions() const { return native_extensions_; }
     bool is_integrated() const;

@@ -365,9 +365,9 @@ class gen_t {
                     if (zp.get(key).is_host_scalar()) switch (key) {
                             case DNNL_ARG_SRC:
                                 return (zp.has_default_values(DNNL_ARG_WEIGHTS))
-                                        ? type_t::f32()
-                                        : type_t::s32();
-                            case DNNL_ARG_DST: return type_t::f32();
+                                        ? dsl::type_t::f32()
+                                        : dsl::type_t::s32();
+                            case DNNL_ARG_DST: return dsl::type_t::f32();
                             default: return to_ir(zp.get(key).get_data_type());
                         }
                 } else if (key & DNNL_ARG_ATTR_SCALES) {
@@ -376,7 +376,7 @@ class gen_t {
                     if (sc.get(key).is_host_scalar())
                         return to_ir(sc.get(key).get_data_type());
                 }
-                return type_t::byte(type::attr_t::ptr);
+                return dsl::type_t::byte(dsl::type::attr_t::ptr);
             };
             const bool wei_reorder_precalc = (t.name == "wei")
                     && cfg.zp_cfg().needs_src_reorder_precalc;
@@ -412,7 +412,7 @@ class gen_t {
             auto create_zero_out_info = [&]() -> kernel_info_t & {
                 auto &zero_out_info
                         = create_kernel_info(pd, kernel_id_t::zero_out);
-                auto size_var = var_t::make(type_t::u32(), "size");
+                auto size_var = var_t::make(dsl::type_t::u32(), "size");
                 zero_out_info.register_immediate_arg(
                         size_var, into<uint32_t>(compute_size));
                 zero_out_info.set_nd_range(zero_out_kernel_desc_t::nd_range(

@@ -97,7 +97,7 @@ std::string prepend_groups_to_tag(const std::string &tag) {
     return "a" + ret;
 }
 
-int get_default_mad_block(const type_t &type) {
+int get_default_mad_block(const dsl::type_t &type) {
     switch (type.size()) {
         // fp4 gets upconverted to f16 for mad.
         case 1: return (type.is_fp4() ? 16 : 32);
@@ -109,7 +109,7 @@ int get_default_mad_block(const type_t &type) {
     return 1;
 }
 
-bool is_small(const type_t &type, dim_t elems) {
+bool is_small(const dsl::type_t &type, dim_t elems) {
     int block = get_default_mad_block(type); // default mad block for type
     return elems <= block / 2; // at most half a block; int division
 }
@@ -195,7 +195,7 @@ status_t problem_t::init(
     isp = id * ih * iw;
     osp = od * oh * ow;
 
-    hw_t hw(make_ir_hw(engine));
+    dsl::hw_t hw(make_ir_hw(engine));
     init_transpose(hw);
     CHECK(init_abc_data_types(hw));
     CHECK(init_acc_data_type());
@@ -317,7 +317,7 @@ int pick_block(dim_t dim, int b0, int b1 = 0, int b2 = 0) {
     return pick_block_impl(false, dim, b0, b1, b2);
 }
 
-int get_default_block(fma_kind_t fma, const type_t &type, dim_t elems) {
+int get_default_block(fma_kind_t fma, const dsl::type_t &type, dim_t elems) {
     if (is_dp_fma(fma)) {
         if (is_small(type, elems)) {
             int packed_dword_elems = 32 / type.bitsize();
@@ -330,7 +330,7 @@ int get_default_block(fma_kind_t fma, const type_t &type, dim_t elems) {
     return get_default_mad_block(type);
 }
 
-fma_kind_t get_default_fma(const hw_t &hw, const type_t &type) {
+fma_kind_t get_default_fma(const dsl::hw_t &hw, const dsl::type_t &type) {
     switch (type.size()) {
         case 1:
             if (hw >= ngen::HW::XeHP) return fma_kind_t::dpas;
@@ -356,8 +356,8 @@ struct nc_block_t {
 
     // Ideally, this should only depend on data type, direction, mb, c, and g to
     // enable the same src/dst formats and avoid reorders between convolutions
-    static nc_block_t get_default_blocking(const hw_t &hw, fma_kind_t fma,
-            type_t type, bool is_dw, dim_t n, dim_t c, dim_t g,
+    static nc_block_t get_default_blocking(const dsl::hw_t &hw, fma_kind_t fma,
+            dsl::type_t type, bool is_dw, dim_t n, dim_t c, dim_t g,
             bool is_output = false) {
         // Select dst layout to align with fma kind of following conv
         // for non-depthwise cases.
@@ -417,7 +417,7 @@ struct goi_block_t {
                 {1, o_block_outer_, i_block_outer_}, wei_letters, wei_idxs);
     }
 
-    static goi_block_t get_default_blocking(type_t type, int vec_size,
+    static goi_block_t get_default_blocking(dsl::type_t type, int vec_size,
             fma_kind_t fma_kind, bool is_fwd, bool is_bwd_d, dim_t g, dim_t o,
             dim_t i, bool ab_transpose) {
         dim_t x = o;
@@ -443,7 +443,7 @@ struct goi_block_t {
                 i_block, o_block_outer, i_block_outer);
     }
 
-    static void get_default_blocking(type_t type, int vec_size,
+    static void get_default_blocking(dsl::type_t type, int vec_size,
             fma_kind_t fma_kind, bool is_fwd, bool is_bwd_d, dim_t g, dim_t x,
             dim_t y, int &g_block, int &x_block, int &y_block,
             int &y_block_outer, bool ab_transpose = false) {
@@ -791,10 +791,10 @@ status_t init_tensor_layouts(
     if (prb.is_bwd_w) {
         if (utils::one_of(prb.wei_data_type, data_type::bf16, data_type::f16,
                     data_type::f8_e5m2, data_type::f8_e4m3))
-            wei_layout = wei_layout.with(type_t::f32());
+            wei_layout = wei_layout.with(dsl::type_t::f32());
         if (utils::one_of(prb.bia_data_type, data_type::bf16, data_type::f16,
                     data_type::f8_e5m2, data_type::f8_e4m3))
-            bia_layout = bia_layout.with(type_t::f32());
+            bia_layout = bia_layout.with(dsl::type_t::f32());
     }
 
     src.set_compute_unnormalized(src_layout, src_tag);
@@ -875,13 +875,13 @@ status_t init_tensor_layouts(
     return status::success;
 }
 
-bool hw_ok(const hw_t &hw) {
+bool hw_ok(const dsl::hw_t &hw) {
     if (hw < ngen::HW::XeLP) return false; // anything below XeLP is rejected
     return true;
 }
 
 bool data_types_ok(
-        const problem_t &prb, const hw_t &hw, impl::engine_t *engine) {
+        const problem_t &prb, const dsl::hw_t &hw, impl::engine_t *engine) {
     auto src = prb.src_data_type;
     auto wei = prb.wei_data_type;
     auto dst = prb.dst_data_type;
@@ -960,7 +960,7 @@ bool zero_points_ok(const problem_t &prb) {
     return true;
 }
 
-bool post_ops_ok(const problem_t &prb, const hw_t &hw) {
+bool post_ops_ok(const problem_t &prb, const dsl::hw_t &hw) {
     auto *pd = prb.conv_pd;
     auto *attr = prb.attr;
 
@@ -1135,7 +1135,7 @@ void init_bwd_d_optimize(config_t &cfg) {
 
 status_t init_pd_time_cfg(const problem_t &prb, config_t &cfg,
         impl::engine_t *engine, convolution_pd_t *pd, primitive_attr_t *attr) {
-    hw_t hw(make_ir_hw(engine));
+    dsl::hw_t hw(make_ir_hw(engine));
 
     VDISPATCH_CHECK(pd, engine, hw_ok(hw), VERBOSE_UNSUPPORTED_ISA);
     VDISPATCH_CHECK(
@@ -1148,7 +1148,7 @@ status_t init_pd_time_cfg(const problem_t &prb, config_t &cfg,
     zero_points_config_t zp_cfg(pd);
     cfg.set_zp_cfg(zp_cfg);
     cfg.set_prb(prb);
-    cfg.set_options(kernel::options_t(hw));
+    cfg.set_options(dsl::kernel::options_t(hw));
     cfg.maybe_override_from_env();
 
     CHECK(init_fma_kind(cfg, pd, engine));
@@ -1165,7 +1165,7 @@ status_t init_pd_time_cfg(const problem_t &prb, config_t &cfg,
 }
 
 bool pipeline_unroll_hint(const problem_t &prb, fma_kind_t fma_kind,
-        const kernel::options_t &options,
+        const dsl::kernel::options_t &options,
         bwd_d_optimize_kind_t bwd_d_optimize_kind,
         bool allow_global_reduction) {
     bool do_unroll = true;
@@ -1853,8 +1853,10 @@ void validate_config_and_plan(config_t &cfg) {
         b_load_pattern = validate_blocking(
                 cfg, stride_layout_t::input_tensor_t::dst, b_2d);
     }
-    auto dummy_mem(var_t::make(type_t::byte(type::attr_t::ptr), "mem"));
-    auto dummy_reg(var_t::make(type_t::byte(type::attr_t::ptr), "reg"));
+    auto dummy_mem(
+            var_t::make(dsl::type_t::byte(dsl::type::attr_t::ptr), "mem"));
+    auto dummy_reg(
+            var_t::make(dsl::type_t::byte(dsl::type::attr_t::ptr), "reg"));
     if (!a_load_pattern.matches(
                 plan.x2r.a_load.create_stmt(dummy_mem, dummy_reg))) {
         gpu_warning() << "Generated load for tensor A does not match "

@@ -543,7 +543,7 @@ class config_t : public prim_config_t {
 
     int reserved_regs() const;
 
-    const hw_t &hw() const { return options().hw(); }
+    const dsl::hw_t &hw() const { return options().hw(); } // from options()
 
     bool is_ge_xe_hpc() const { return hw() >= ngen::HW::XeHPC; }
 
@@ -691,7 +691,7 @@ int slm_bufs_hint(const problem_t &prb, dim_t m_tg, dim_t n_tg,
         bool do_unroll);
 tensor_config_t get_tensor_config(
         const config_t &cfg, const memory_desc_t *zp_src);
-bool is_small(const type_t &type, dim_t elems);
+bool is_small(const dsl::type_t &type, dim_t elems);
 int estimate_register_count(const config_t &cfg);
 int default_regs(const config_t &cfg);
 void init_kernel_grid(config_t &cfg);