Skip to content

Commit f24fa2c

Browse files
committed
Move planning hints from global coordinator state to per-operation
Instead of setting hints globally on the Coordinator (which requires coordinating hint state across operations), hints are now passed with each copy/pull submission. This makes the API stateless and allows different operations to use different routing strategies.

Key changes:
- Add hints parameter to SubmitCopyRequest and SubmitPullRequest
- Add RoutingHint with serializable topology path
- Update Client.copy() and Client.pull() to accept hints
- Remove global add_hint/clear_hints from Coordinator pybind
- Add Topology serialization support for hint transport
- Extract Handler, Executor, Gateway, DispatchManager, and Types from monolithic Coordinator into separate files
- DispatchManager encapsulates CopyKey as private implementation detail
- Introduce ShardSubmission struct to reduce SubmitShard parameter count
- Merge operation_hints_ and operation_fingerprints_ into single map
- Add FinalizeAggregation to move post-completion bookkeeping into DispatchManager
- Replace CancelPendingIf with CancelPendingByTensors public API
- Assert on unknown copy_op_id in RecordResponse
- Use std::visit for SubmitResult handling instead of std::get_if
1 parent 0833775 commit f24fa2c

24 files changed

Lines changed: 1512 additions & 971 deletions

csrc/setu/client/Client.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,8 @@ std::optional<TensorShardRef> Client::RegisterTensorShard(
154154
return response.shard_ref;
155155
}
156156

157-
std::optional<CopyOperationId> Client::SubmitCopy(const CopySpec& copy_spec) {
157+
std::optional<CopyOperationId> Client::SubmitCopy(
158+
const CopySpec& copy_spec, const std::vector<CompilerHint>& hints) {
158159
// Find all shards owned by this client that are involved in the copy
159160
// (either as source or destination)
160161
std::vector<ShardId> involved_shards;
@@ -175,10 +176,14 @@ std::optional<CopyOperationId> Client::SubmitCopy(const CopySpec& copy_spec) {
175176
"Client has no shards for src {} or dst {}",
176177
copy_spec.src_name, copy_spec.dst_name);
177178

179+
// Compute fingerprint once for all shard submissions
180+
const auto fingerprint = setu::planner::hints::Fingerprint(hints);
181+
178182
// Submit a request for each involved shard
179183
std::optional<CopyOperationId> copy_op_id;
180184
for (const auto& shard_id : involved_shards) {
181-
ClientRequest request = SubmitCopyRequest(shard_id, copy_spec);
185+
ClientRequest request =
186+
SubmitCopyRequest(shard_id, copy_spec, hints, fingerprint);
182187
Comm::Send(request_socket_, request);
183188

184189
auto response = Comm::Recv<SubmitCopyResponse>(request_socket_);
@@ -196,18 +201,23 @@ std::optional<CopyOperationId> Client::SubmitCopy(const CopySpec& copy_spec) {
196201
return copy_op_id;
197202
}
198203

199-
std::optional<CopyOperationId> Client::SubmitPull(const CopySpec& copy_spec) {
204+
std::optional<CopyOperationId> Client::SubmitPull(
205+
const CopySpec& copy_spec, const std::vector<CompilerHint>& hints) {
200206
// For Pull: only destination shards submit (one-sided operation)
201207
auto it = tensor_shards_.find(copy_spec.dst_name);
202208
ASSERT_VALID_RUNTIME(it != tensor_shards_.end(),
203209
"Client has no shards for dst {}", copy_spec.dst_name);
204210

211+
// Compute fingerprint once for all shard submissions
212+
const auto fingerprint = setu::planner::hints::Fingerprint(hints);
213+
205214
// Submit a request for each destination shard
206215
std::optional<CopyOperationId> copy_op_id;
207216
for (const auto& shard_ref : it->second) {
208217
const auto shard_id = shard_ref->shard_id;
209218

210-
ClientRequest request = SubmitPullRequest(shard_id, copy_spec);
219+
ClientRequest request =
220+
SubmitPullRequest(shard_id, copy_spec, hints, fingerprint);
211221
Comm::Send(request_socket_, request);
212222

213223
auto response = Comm::Recv<SubmitCopyResponse>(request_socket_);

csrc/setu/client/Client.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "commons/utils/TorchTensorIPC.h"
2727
#include "commons/utils/ZmqHelper.h"
2828
#include "messaging/GetTensorHandleResponse.h"
29+
#include "planner/hints/Hint.h"
2930

3031
namespace setu::client {
3132
using setu::commons::CopyOperationId;
@@ -41,6 +42,7 @@ using setu::commons::messages::GetTensorHandleResponse;
4142
using setu::commons::utils::TensorIPCSpec;
4243
using setu::commons::utils::ZmqContextPtr;
4344
using setu::commons::utils::ZmqSocketPtr;
45+
using setu::planner::hints::CompilerHint;
4446

4547
class Client {
4648
public:
@@ -58,9 +60,11 @@ class Client {
5860
std::optional<TensorShardRef> RegisterTensorShard(
5961
const TensorShardSpec& shard_spec);
6062

61-
std::optional<CopyOperationId> SubmitCopy(const CopySpec& copy_spec);
63+
std::optional<CopyOperationId> SubmitCopy(
64+
const CopySpec& copy_spec, const std::vector<CompilerHint>& hints = {});
6265

63-
std::optional<CopyOperationId> SubmitPull(const CopySpec& copy_spec);
66+
std::optional<CopyOperationId> SubmitPull(
67+
const CopySpec& copy_spec, const std::vector<CompilerHint>& hints = {});
6468

6569
void WaitForCopy(CopyOperationId copy_op_id);
6670

csrc/setu/client/Pybind.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,25 @@
1616
//==============================================================================
1717
#include "commons/utils/Pybind.h"
1818

19+
#include <pybind11/stl.h>
20+
1921
#include "client/Client.h"
2022
#include "commons/Logging.h"
2123
#include "commons/StdCommon.h"
2224
#include "commons/TorchCommon.h"
2325
#include "commons/datatypes/CopySpec.h"
2426
#include "commons/datatypes/TensorShardSpec.h"
2527
#include "commons/enums/Enums.h"
28+
#include "planner/hints/Hint.h"
2629
//==============================================================================
2730
namespace setu::client {
2831
//==============================================================================
2932
using setu::commons::CopyOperationId;
3033
using setu::commons::datatypes::CopySpec;
3134
using setu::commons::datatypes::TensorShardSpec;
3235
using setu::commons::enums::ErrorCode;
36+
using setu::planner::hints::CompilerHint;
37+
using setu::planner::hints::RoutingHint;
3338
//==============================================================================
3439
void InitClientPybindClass(py::module_& m) {
3540
py::class_<Client, std::shared_ptr<Client>>(m, "Client")
@@ -46,8 +51,10 @@ void InitClientPybindClass(py::module_& m) {
4651
py::arg("shard_spec"),
4752
"Register a tensor shard and return a reference to it")
4853
.def("submit_copy", &Client::SubmitCopy, py::arg("copy_spec"),
54+
py::arg("hints") = std::vector<CompilerHint>{},
4955
"Submit a copy operation and return an operation ID")
5056
.def("submit_pull", &Client::SubmitPull, py::arg("copy_spec"),
57+
py::arg("hints") = std::vector<CompilerHint>{},
5158
"Submit a pull operation and return an operation ID")
5259
.def("wait_for_copy", &Client::WaitForCopy, py::arg("copy_op_id"),
5360
"Wait for a copy operation to complete")

csrc/setu/commons/datatypes/Device.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ struct Device {
9898
return static_cast<std::int16_t>(torch_device.index());
9999
}
100100

101-
torch::Device torch_device; ///< PyTorch device (type + local index)
101+
torch::Device torch_device{
102+
torch::kCUDA}; ///< PyTorch device (type + local index)
102103
};
103104
//==============================================================================
104105
} // namespace setu::commons::datatypes

csrc/setu/commons/utils/ShardAggregator.h

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,14 @@ struct CompletedGroup {
4747
/// validation, and participant info. When all expected shards have arrived, the
4848
/// completed group is returned and the internal state is cleaned up.
4949
///
50-
/// @tparam KeyType The group key type (must support operator<).
50+
/// @tparam KeyType The group key type (must be hashable via KeyHash).
5151
/// @tparam PayloadType The payload type stored per group. Must support
5252
/// operator== for validation of consistency across submissions.
53-
template <typename KeyType, typename PayloadType>
53+
/// @tparam KeyHash Hash function object for KeyType.
54+
/// @tparam KeyEqual Equality function object for KeyType.
55+
template <typename KeyType, typename PayloadType,
56+
typename KeyHash = boost::hash<KeyType>,
57+
typename KeyEqual = std::equal_to<KeyType>>
5458
class ShardAggregator {
5559
public:
5660
/// @brief Submit a shard for aggregation.
@@ -63,7 +67,8 @@ class ShardAggregator {
6367
/// @param participant [in] The identity and request_id of the submitter.
6468
/// @param expected_count [in] Total number of shards expected for this group.
6569
/// @param validate_fn [in] Callable(const PayloadType& stored, const
66-
/// PayloadType& incoming) that asserts payload consistency.
70+
/// PayloadType& incoming) → bool. Returns true if payloads are consistent,
71+
/// false to reject and cancel the group.
6772
/// @return CompletedGroup if this submission completes the group, nullopt
6873
/// otherwise.
6974
template <typename ValidateFn>
@@ -82,8 +87,8 @@ class ShardAggregator {
8287
// Store or validate the payload
8388
if (!group.payload.has_value()) {
8489
group.payload.emplace(payload);
85-
} else {
86-
validate_fn(group.payload.value(), payload);
90+
} else if (!validate_fn(group.payload.value(), payload)) {
91+
return std::nullopt;
8792
}
8893

8994
group.shards_received.insert(shard_id);
@@ -100,6 +105,21 @@ class ShardAggregator {
100105
return std::nullopt;
101106
}
102107

108+
/// @brief Cancel and remove the group for a specific key.
109+
///
110+
/// @param key [in] The group key to cancel.
111+
/// @return All participants from the cancelled group.
112+
[[nodiscard]] std::vector<AggregationParticipant> Cancel(
113+
const KeyType& key /*[in]*/) {
114+
std::vector<AggregationParticipant> cancelled_participants;
115+
auto it = groups_.find(key);
116+
if (it != groups_.end()) {
117+
cancelled_participants = std::move(it->second.participants);
118+
groups_.erase(it);
119+
}
120+
return cancelled_participants;
121+
}
122+
103123
/// @brief Cancel and remove all groups whose key matches the predicate.
104124
///
105125
/// This is used to clean up partially-aggregated groups when the shards
@@ -134,7 +154,7 @@ class ShardAggregator {
134154
std::vector<AggregationParticipant> participants;
135155
};
136156

137-
std::map<KeyType, PendingGroup> groups_;
157+
std::unordered_map<KeyType, PendingGroup, KeyHash, KeyEqual> groups_;
138158
};
139159

140160
//==============================================================================

0 commit comments

Comments
 (0)