Skip to content

Commit 3a0c658

Browse files
authored
Intel(R) SHMEM (ISHMEM) 1.5.1 (#14)
Signed-off-by: sys_shmem <[email protected]>
1 parent 21b3dfc commit 3a0c658

File tree

9 files changed

+419
-3
lines changed

9 files changed

+419
-3
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ set(PROJECT_FULL_NAME "Intel® SHMEM")
1313

1414
set(ISHMEM_MAJOR_VERSION "1")
1515
set(ISHMEM_MINOR_VERSION "5")
16-
set(ISHMEM_PATCH_VERSION "0")
16+
set(ISHMEM_PATCH_VERSION "1")
1717
set(PROJECT_VERSION "${ISHMEM_MAJOR_VERSION}.${ISHMEM_MINOR_VERSION}.${ISHMEM_PATCH_VERSION}")
1818

1919
project(${PROJECT_NAME} VERSION ${PROJECT_VERSION} LANGUAGES C CXX)

RELEASE_NOTES.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
11
# Release Notes <!-- omit in toc -->
22
This document contains a list of new features and known limitations of Intel® SHMEM releases.
33

4+
## Release 1.5.1
5+
6+
### New Features and Enhancements
7+
- Fix a compilation issue due to missing files
8+
9+
### Known Limitations
10+
- Only [Sandia OpenSHMEM](https://github.com/Sandia-OpenSHMEM/SOS) and [Intel® MPI Library](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) are currently supported as the host back-end.
11+
- Not all APIs from OpenSHMEM standard are supported. Please refer to [Supported/Unsupported Features](https://oneapi-src.github.io/ishmem/supported_features.html) to get a complete view.
12+
- Intel® SHMEM requires a one-to-one mapping of PEs to SYCL devices. This implies that Intel® SHMEM executions must launch with a number of processes on each compute node that is no more than the number of available SYCL devices on each one of those nodes. By default, the Intel® SHMEM runtime considers each individual device tile to make up a single SYCL device and assigns a tile per PE.
13+
- All collective operations within a kernel must complete before invoking subsequent kernel-initiated collective operation.
14+
- To run Intel® SHMEM with SOS enabling the Slingshot provider in OFI, environment variable `FI_CXI_OPTIMIZED_MRS=0` must be used. It is also recommended to use `FI_CXI_DEFAULT_CQ_SIZE=131072`.
15+
- To run Intel® SHMEM with SOS enabling the verbs provider, environment variable `MLX5_SCATTER_TO_CQE=0` must be used.
16+
- To run Intel® SHMEM with Intel® MPI Library, environment variable `I_MPI_OFFLOAD=1` must be used. Additionally, `I_MPI_OFFLOAD_RDMA=1` may be necessary for GPU RDMA depending on the OFI provider. Please refer to the [reference guide](https://www.intel.com/content/www/us/en/docs/mpi-library/developer-reference-linux/2021-16/gpu-buffers-support.html) for further details.
17+
- Inter-node communication in Intel® SHMEM requires [dma-buf](https://www.kernel.org/doc/html/latest/driver-api/dma-buf.html) support in the Linux kernel. Inter-node functionality in Intel® SHMEM Release 1.5.0 is tested with SUSE Linux Enterprise Server 15 SP4.
18+
- Support for Intel® Arc™ B-Series GPUs is preliminary. As such, not all APIs are currently supported.
19+
- When using Intel® Arc™ B-Series GPUs, environment variable `RenderCompressedBuffersEnabled=0` is required. This is automatically set when running with the launcher script `ishmrun`.
20+
421
## Release 1.5.0
522

623
### New Features and Enhancements

docs/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
project = u'Intel® SHMEM'
1717
copyright = u'2024 Intel Corporation licensed under Creative Commons BY 4.0'
1818
author = u'Intel Corporation'
19-
release = u'1.5.0'
19+
release = u'1.5.1'
2020
version = release
2121

2222
# -- General configuration ---------------------------------------------------

src/collectives/scan.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/* Copyright (C) 2025 Intel Corporation
2+
* SPDX-License-Identifier: BSD-3-Clause
3+
*/
4+
5+
#include "collectives/scan_impl.h"
6+
7+
/* clang-format off */
8+
#define ISHMEMI_API_IMPL_INSCAN(TYPENAME, TYPE) \
9+
int ishmem_##TYPENAME##_sum_inscan(TYPE *dest, const TYPE *src, size_t nelems) { return ishmem_sum_inscan(ISHMEM_TEAM_WORLD, dest, src, nelems); } \
10+
int ishmem_##TYPENAME##_sum_inscan(ishmem_team_t team, TYPE *dest, const TYPE *src, size_t nelems) { return ishmem_sum_inscan(team, dest, src, nelems); } \
11+
sycl::event ishmemx_##TYPENAME##_sum_inscan_on_queue(TYPE *dest, const TYPE *src, size_t nelems, int *ret, sycl::queue &q, const std::vector<sycl::event> &deps) { \
12+
return ishmemx_sum_inscan_on_queue(ISHMEM_TEAM_WORLD, dest, src, nelems, ret, q, deps); \
13+
} \
14+
sycl::event ishmemx_##TYPENAME##_sum_inscan_on_queue(ishmem_team_t team, TYPE *dest, const TYPE *src, size_t nelems, int *ret, sycl::queue &q, const std::vector<sycl::event> &deps) { \
15+
return ishmemx_sum_inscan_on_queue(team, dest, src, nelems, ret, q, deps); \
16+
}
17+
#define ISHMEMI_API_IMPL_EXSCAN(TYPENAME, TYPE) \
18+
int ishmem_##TYPENAME##_sum_exscan(TYPE *dest, const TYPE *src, size_t nelems) { return ishmem_sum_exscan(ISHMEM_TEAM_WORLD, dest, src, nelems); } \
19+
int ishmem_##TYPENAME##_sum_exscan(ishmem_team_t team, TYPE *dest, const TYPE *src, size_t nelems) { return ishmem_sum_exscan(team, dest, src, nelems); } \
20+
sycl::event ishmemx_##TYPENAME##_sum_exscan_on_queue(TYPE *dest, const TYPE *src, size_t nelems, int *ret, sycl::queue &q, const std::vector<sycl::event> &deps) { \
21+
return ishmemx_sum_exscan_on_queue(ISHMEM_TEAM_WORLD, dest, src, nelems, ret, q, deps); \
22+
} \
23+
sycl::event ishmemx_##TYPENAME##_sum_exscan_on_queue(ishmem_team_t team, TYPE *dest, const TYPE *src, size_t nelems, int *ret, sycl::queue &q, const std::vector<sycl::event> &deps) { \
24+
return ishmemx_sum_exscan_on_queue(team, dest, src, nelems, ret, q, deps); \
25+
}
26+
/* clang-format on */
27+
28+
ISHMEMI_API_IMPL_INSCAN(float, float)
29+
ISHMEMI_API_IMPL_INSCAN(double, double)
30+
ISHMEMI_API_IMPL_INSCAN(char, char)
31+
ISHMEMI_API_IMPL_INSCAN(schar, signed char)
32+
ISHMEMI_API_IMPL_INSCAN(short, short)
33+
ISHMEMI_API_IMPL_INSCAN(int, int)
34+
ISHMEMI_API_IMPL_INSCAN(long, long)
35+
ISHMEMI_API_IMPL_INSCAN(longlong, long long)
36+
ISHMEMI_API_IMPL_INSCAN(uchar, unsigned char)
37+
ISHMEMI_API_IMPL_INSCAN(ushort, unsigned short)
38+
ISHMEMI_API_IMPL_INSCAN(uint, unsigned int)
39+
ISHMEMI_API_IMPL_INSCAN(ulong, unsigned long)
40+
ISHMEMI_API_IMPL_INSCAN(ulonglong, unsigned long long)
41+
ISHMEMI_API_IMPL_INSCAN(int8, int8_t)
42+
ISHMEMI_API_IMPL_INSCAN(int16, int16_t)
43+
ISHMEMI_API_IMPL_INSCAN(int32, int32_t)
44+
ISHMEMI_API_IMPL_INSCAN(int64, int64_t)
45+
ISHMEMI_API_IMPL_INSCAN(uint8, uint8_t)
46+
ISHMEMI_API_IMPL_INSCAN(uint16, uint16_t)
47+
ISHMEMI_API_IMPL_INSCAN(uint32, uint32_t)
48+
ISHMEMI_API_IMPL_INSCAN(uint64, uint64_t)
49+
ISHMEMI_API_IMPL_INSCAN(size, size_t)
50+
ISHMEMI_API_IMPL_INSCAN(ptrdiff, ptrdiff_t)
51+
52+
ISHMEMI_API_IMPL_EXSCAN(float, float)
53+
ISHMEMI_API_IMPL_EXSCAN(double, double)
54+
ISHMEMI_API_IMPL_EXSCAN(char, char)
55+
ISHMEMI_API_IMPL_EXSCAN(schar, signed char)
56+
ISHMEMI_API_IMPL_EXSCAN(short, short)
57+
ISHMEMI_API_IMPL_EXSCAN(int, int)
58+
ISHMEMI_API_IMPL_EXSCAN(long, long)
59+
ISHMEMI_API_IMPL_EXSCAN(longlong, long long)
60+
ISHMEMI_API_IMPL_EXSCAN(uchar, unsigned char)
61+
ISHMEMI_API_IMPL_EXSCAN(ushort, unsigned short)
62+
ISHMEMI_API_IMPL_EXSCAN(uint, unsigned int)
63+
ISHMEMI_API_IMPL_EXSCAN(ulong, unsigned long)
64+
ISHMEMI_API_IMPL_EXSCAN(ulonglong, unsigned long long)
65+
ISHMEMI_API_IMPL_EXSCAN(int8, int8_t)
66+
ISHMEMI_API_IMPL_EXSCAN(int16, int16_t)
67+
ISHMEMI_API_IMPL_EXSCAN(int32, int32_t)
68+
ISHMEMI_API_IMPL_EXSCAN(int64, int64_t)
69+
ISHMEMI_API_IMPL_EXSCAN(uint8, uint8_t)
70+
ISHMEMI_API_IMPL_EXSCAN(uint16, uint16_t)
71+
ISHMEMI_API_IMPL_EXSCAN(uint32, uint32_t)
72+
ISHMEMI_API_IMPL_EXSCAN(uint64, uint64_t)
73+
ISHMEMI_API_IMPL_EXSCAN(size, size_t)
74+
ISHMEMI_API_IMPL_EXSCAN(ptrdiff, ptrdiff_t)

src/collectives/scan_impl.h

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
/* Copyright (C) 2025 Intel Corporation
2+
* SPDX-License-Identifier: BSD-3-Clause
3+
*/
4+
5+
#ifndef COLLECTIVES_SCAN_IMPL_H
6+
#define COLLECTIVES_SCAN_IMPL_H
7+
8+
#include "collectives.h"
9+
#include "runtime.h"
10+
#include "proxy_impl.h"
11+
#include "on_queue.h"
12+
13+
template <typename T, ishmemi_op_t OP>
14+
int scan_impl(ishmem_team_t team, T *dest, const T *src, size_t nelems)
15+
{
16+
if constexpr (enable_error_checking) {
17+
validate_parameters((void *) dest, (void *) src, nelems * sizeof(T));
18+
}
19+
20+
ishmemi_request_t req;
21+
req.src = src;
22+
req.dst = dest;
23+
req.nelems = nelems;
24+
req.op = OP;
25+
req.type = ishmemi_union_get_base_type<T, OP>();
26+
req.team = team;
27+
28+
#ifdef __SYCL_DEVICE_ONLY__
29+
return ishmemi_proxy_blocking_request_status(req);
30+
#else
31+
ishmemi_ringcompletion_t comp;
32+
ishmemi_runtime->proxy_funcs[req.op][req.type](&req, &comp);
33+
return ishmemi_proxy_get_status(comp.completion.ret);
34+
#endif
35+
}
36+
37+
template <typename T>
38+
int ishmem_sum_inscan(ishmem_team_t team, T *dest, const T *src, size_t nelems)
39+
{
40+
return scan_impl<T, INSCAN>(team, dest, src, nelems);
41+
}
42+
43+
template <typename T>
44+
int ishmem_sum_exscan(ishmem_team_t team, T *dest, const T *src, size_t nelems)
45+
{
46+
return scan_impl<T, EXSCAN>(team, dest, src, nelems);
47+
}
48+
49+
template <typename T, ishmemi_op_t OP>
50+
sycl::event scan_on_queue_impl(ishmem_team_t team, T *dest, const T *src, size_t nelems, int *ret,
51+
sycl::queue &q, const std::vector<sycl::event> &deps)
52+
{
53+
bool entry_already_exists = true;
54+
const std::lock_guard<std::mutex> lock(ishmemi_on_queue_events_map.map_mtx);
55+
auto iter = ishmemi_on_queue_events_map.get_entry_info(q, entry_already_exists);
56+
57+
auto e = q.submit([&](sycl::handler &cgh) {
58+
set_cmd_grp_dependencies(cgh, entry_already_exists, iter->second->event, deps);
59+
cgh.single_task([=]() {
60+
int tmp_ret = scan_impl<T, OP>(team, dest, src, nelems);
61+
if (ret) *ret = tmp_ret;
62+
});
63+
});
64+
ishmemi_on_queue_events_map[&q]->event = e;
65+
return e;
66+
}
67+
68+
template <typename T>
69+
sycl::event ishmemx_sum_inscan_on_queue(ishmem_team_t team, T *dest, const T *src, size_t nelems,
70+
int *ret, sycl::queue &q,
71+
const std::vector<sycl::event> &deps)
72+
{
73+
return scan_on_queue_impl<T, INSCAN>(team, dest, src, nelems, ret, q, deps);
74+
}
75+
76+
template <typename T>
77+
sycl::event ishmemx_sum_inscan_on_queue(T *dest, const T *src, size_t nelems, int *ret,
78+
sycl::queue &q, const std::vector<sycl::event> &deps)
79+
{
80+
return scan_on_queue_impl<T, INSCAN>(ISHMEM_TEAM_WORLD, dest, src, nelems, ret, q, deps);
81+
}
82+
83+
template <typename T>
84+
sycl::event ishmemx_sum_exscan_on_queue(ishmem_team_t team, T *dest, const T *src, size_t nelems,
85+
int *ret, sycl::queue &q,
86+
const std::vector<sycl::event> &deps)
87+
{
88+
return scan_on_queue_impl<T, EXSCAN>(team, dest, src, nelems, ret, q, deps);
89+
}
90+
91+
template <typename T>
92+
sycl::event ishmemx_sum_exscan_on_queue(T *dest, const T *src, size_t nelems, int *ret,
93+
sycl::queue &q, const std::vector<sycl::event> &deps)
94+
{
95+
return scan_on_queue_impl<T, EXSCAN>(ISHMEM_TEAM_WORLD, dest, src, nelems, ret, q, deps);
96+
}
97+
98+
#endif

src/collectives/sync_impl.h

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/* Copyright (C) 2025 Intel Corporation
2+
* SPDX-License-Identifier: BSD-3-Clause
3+
*/
4+
5+
#ifndef COLLECTIVES_SYNC_IMPL_H
6+
#define COLLECTIVES_SYNC_IMPL_H
7+
8+
#include "ishmem/types.h"
9+
#include "proxy_impl.h"
10+
#include "collectives.h"
11+
#include "runtime.h"
12+
#include "teams.h"
13+
14+
static inline void sync_team_fallback(ishmem_team_t team)
15+
{
16+
ishmemi_request_t req;
17+
ishmemi_ringcompletion_t comp __attribute__((unused));
18+
req.op = TEAM_SYNC;
19+
req.type = NONE;
20+
req.team = team;
21+
22+
#ifdef __SYCL_DEVICE_ONLY__
23+
ishmemi_proxy_blocking_request(req);
24+
atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::system);
25+
#else
26+
ishmemi_runtime->proxy_funcs[req.op][req.type](&req, &comp);
27+
#endif
28+
}
29+
30+
ISHMEM_DEVICE_ATTRIBUTES inline void ishmemi_team_sync(ishmem_team_t team)
31+
{
32+
/* Node-local, on-device implementation */
33+
if constexpr (ishmemi_is_device) {
34+
ishmemi_info_t *info = global_info;
35+
ishmemi_team_device_t *team_ptr = &info->team_device_pool[team];
36+
if (team_ptr->only_intra) {
37+
sycl::atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::system);
38+
39+
int index = team_ptr->psync_idx;
40+
long *my_psync = &team_ptr->psync[team_ptr->psync_idx];
41+
team_ptr->psync_idx = (index + 1) % N_PSYNCS_PER_TEAM;
42+
43+
/* This atomic has to be seq_cst because we definitely want it to happen in order */
44+
sycl::atomic_ref<long, sycl::memory_order::seq_cst, sycl::memory_scope::system,
45+
sycl::access::address_space::global_space>
46+
atomic_psync(*my_psync);
47+
48+
for (int i = team_ptr->start; i <= team_ptr->last_pe; i += team_ptr->stride) {
49+
uint8_t local_index = ISHMEMI_LOCAL_PES[i];
50+
long *remote_psync = ISHMEMI_FAST_ADJUST(long, info, local_index, my_psync);
51+
52+
/* These atomics can be relaxed because we don't care about their ordering */
53+
sycl::atomic_ref<long, sycl::memory_order::relaxed, sycl::memory_scope::system,
54+
sycl::access::address_space::global_space>
55+
atomic_psync(*remote_psync);
56+
atomic_psync += 1L;
57+
}
58+
59+
while (atomic_psync.load() != team_ptr->size)
60+
;
61+
atomic_psync.store(0);
62+
63+
return;
64+
}
65+
}
66+
67+
/* Otherwise */
68+
sync_team_fallback(team);
69+
}
70+
71+
#endif

src/ishmem.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
#define ISHMEM_MAJOR_VERSION 1
1818
#define ISHMEM_MINOR_VERSION 5
19-
#define ISHMEM_PATCH_VERSION 0
19+
#define ISHMEM_PATCH_VERSION 1
2020
#define ISHMEM_MAX_NAME_LEN 256
2121
#define ISHMEM_VENDOR_STRING "Intel® SHMEM"
2222

test/unit/exscan.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/* Copyright (C) 2025 Intel Corporation
2+
* SPDX-License-Identifier: BSD-3-Clause
3+
*/
4+
5+
#include <ishmem_tester.h>
6+
#include "rma_test.h"
7+
8+
#undef TEST_BRANCH_ON_QUEUE
9+
10+
#define TEST_BRANCH_SINGLE(testname, typeenum, typename, type, op, opname) \
11+
*res = ishmem_##typename##_##testname(ISHMEM_TEAM_WORLD, (type *) dest, (type *) src, nelems);
12+
13+
#define TEST_BRANCH_ON_QUEUE(testname, typeenum, typename, type, op, opname) \
14+
ishmemx_##typename##_##testname##_on_queue((type *) dest, (type *) src, nelems, res, q);
15+
16+
GEN_HOST_FNS(sum_exscan, host, NOP, nop)
17+
GEN_ON_QUEUE_FNS(sum_exscan, on_queue, NOP, nop)
18+
GEN_SINGLE_FNS(sum_exscan, single, NOP, nop)
19+
20+
class sum_exscan_tester : public ishmem_tester {
21+
public:
22+
sum_exscan_tester(int argc, char *argv[]) : ishmem_tester(argc, argv, true) {}
23+
virtual size_t create_source_pattern(ishmemi_type_t t, ishmemi_op_t op, testmode_t mode,
24+
size_t nelems);
25+
virtual size_t create_check_pattern(ishmemi_type_t t, ishmemi_op_t op, testmode_t mode,
26+
size_t nelems);
27+
};
28+
29+
/* result written into aligned_source, using host_source as a temp buffer if needed */
30+
size_t sum_exscan_tester::create_source_pattern(ishmemi_type_t t, ishmemi_op_t op, testmode_t mode,
31+
size_t nelems)
32+
{
33+
size_t test_size = nelems * typesize(t);
34+
size_t from_pe = (size_t) my_pe;
35+
for (size_t idx = 0; idx < ((test_size / sizeof(long)) + 1); idx += 1) {
36+
aligned_source[idx] = (long) from_pe + (long) idx;
37+
if (patterndebugflag && (idx < 16)) {
38+
printf("[%d] source pattern idx %lu val %016lx\n", my_pe, idx, aligned_source[idx]);
39+
}
40+
}
41+
42+
return (test_size);
43+
}
44+
45+
/* check pattern written into host_check, using host_source as a temp buffer */
46+
size_t sum_exscan_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op_t op, testmode_t mode,
47+
size_t nelems)
48+
{
49+
size_t test_size = nelems * typesize(t);
50+
size_t to_pe = (size_t) my_pe;
51+
for (size_t idx = 0; idx < ((test_size / sizeof(long)) + 1); idx += 1) {
52+
host_check[idx] = ((to_pe + idx) * (to_pe + idx - 1) / 2 - (idx * (idx - 1)) / 2);
53+
if (patterndebugflag && (idx < 16)) {
54+
printf("[%d] check pattern idx %lu val %016lx\n", my_pe, idx, host_check[idx]);
55+
}
56+
}
57+
58+
return (test_size);
59+
}
60+
61+
int main(int argc, char **argv)
62+
{
63+
class sum_exscan_tester t(argc, argv);
64+
65+
size_t bufsize = (t.max_nelems * t.typesize(SIZE128) * (size_t) t.n_pes) + 4096L;
66+
t.alloc_memory(bufsize);
67+
size_t errors = 0;
68+
69+
GEN_FN_TABLE(sum_exscan, host, NOP, nop)
70+
GEN_FN_TABLE(sum_exscan, on_queue, NOP, nop)
71+
GEN_FN_TABLE(sum_exscan, single, NOP, nop)
72+
73+
if (!t.test_types_set) t.add_test_type_list(scan_types);
74+
errors += t.run_aligned_tests(NOP);
75+
errors += t.run_offset_tests(NOP);
76+
77+
return (t.finalize_and_report(errors));
78+
}

0 commit comments

Comments
 (0)