Skip to content

Commit 026a6d3

Browse files
committed
Move planning hints from global coordinator state to per-operation
Instead of setting hints globally on the Coordinator (which requires coordinating hint state across operations), hints are now passed with each copy/pull submission. This makes the API stateless and allows different operations to use different routing strategies.

Key changes:
- Add a hints parameter to SubmitCopyRequest and SubmitPullRequest.
- Add RoutingHint with a serializable topology path.
- Update Client.copy() and Client.pull() to accept hints.
- Remove the global add_hint/clear_hints bindings from the Coordinator pybind module.
- Add Topology serialization support for hint transport.
- Introduce PendingDispatch to encapsulate all per-CopyKey state (shard aggregation, hints, fingerprints) behind SubmitShard and CancelIf, eliminating the duplicated cleanup logic that was spread across HandleShardSubmission and HandleDeregisterShardsRequest.
1 parent 0833775 commit 026a6d3

15 files changed

Lines changed: 322 additions & 104 deletions

File tree

csrc/setu/client/Client.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,8 @@ std::optional<TensorShardRef> Client::RegisterTensorShard(
154154
return response.shard_ref;
155155
}
156156

157-
std::optional<CopyOperationId> Client::SubmitCopy(const CopySpec& copy_spec) {
157+
std::optional<CopyOperationId> Client::SubmitCopy(
158+
const CopySpec& copy_spec, const std::vector<CompilerHint>& hints) {
158159
// Find all shards owned by this client that are involved in the copy
159160
// (either as source or destination)
160161
std::vector<ShardId> involved_shards;
@@ -175,10 +176,14 @@ std::optional<CopyOperationId> Client::SubmitCopy(const CopySpec& copy_spec) {
175176
"Client has no shards for src {} or dst {}",
176177
copy_spec.src_name, copy_spec.dst_name);
177178

179+
// Compute fingerprint once for all shard submissions
180+
const auto fingerprint = setu::planner::hints::Fingerprint(hints);
181+
178182
// Submit a request for each involved shard
179183
std::optional<CopyOperationId> copy_op_id;
180184
for (const auto& shard_id : involved_shards) {
181-
ClientRequest request = SubmitCopyRequest(shard_id, copy_spec);
185+
ClientRequest request =
186+
SubmitCopyRequest(shard_id, copy_spec, hints, fingerprint);
182187
Comm::Send(request_socket_, request);
183188

184189
auto response = Comm::Recv<SubmitCopyResponse>(request_socket_);
@@ -196,18 +201,23 @@ std::optional<CopyOperationId> Client::SubmitCopy(const CopySpec& copy_spec) {
196201
return copy_op_id;
197202
}
198203

199-
std::optional<CopyOperationId> Client::SubmitPull(const CopySpec& copy_spec) {
204+
std::optional<CopyOperationId> Client::SubmitPull(
205+
const CopySpec& copy_spec, const std::vector<CompilerHint>& hints) {
200206
// For Pull: only destination shards submit (one-sided operation)
201207
auto it = tensor_shards_.find(copy_spec.dst_name);
202208
ASSERT_VALID_RUNTIME(it != tensor_shards_.end(),
203209
"Client has no shards for dst {}", copy_spec.dst_name);
204210

211+
// Compute fingerprint once for all shard submissions
212+
const auto fingerprint = setu::planner::hints::Fingerprint(hints);
213+
205214
// Submit a request for each destination shard
206215
std::optional<CopyOperationId> copy_op_id;
207216
for (const auto& shard_ref : it->second) {
208217
const auto shard_id = shard_ref->shard_id;
209218

210-
ClientRequest request = SubmitPullRequest(shard_id, copy_spec);
219+
ClientRequest request =
220+
SubmitPullRequest(shard_id, copy_spec, hints, fingerprint);
211221
Comm::Send(request_socket_, request);
212222

213223
auto response = Comm::Recv<SubmitCopyResponse>(request_socket_);

csrc/setu/client/Client.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "commons/utils/TorchTensorIPC.h"
2727
#include "commons/utils/ZmqHelper.h"
2828
#include "messaging/GetTensorHandleResponse.h"
29+
#include "planner/hints/Hint.h"
2930

3031
namespace setu::client {
3132
using setu::commons::CopyOperationId;
@@ -41,6 +42,7 @@ using setu::commons::messages::GetTensorHandleResponse;
4142
using setu::commons::utils::TensorIPCSpec;
4243
using setu::commons::utils::ZmqContextPtr;
4344
using setu::commons::utils::ZmqSocketPtr;
45+
using setu::planner::hints::CompilerHint;
4446

4547
class Client {
4648
public:
@@ -58,9 +60,11 @@ class Client {
5860
std::optional<TensorShardRef> RegisterTensorShard(
5961
const TensorShardSpec& shard_spec);
6062

61-
std::optional<CopyOperationId> SubmitCopy(const CopySpec& copy_spec);
63+
std::optional<CopyOperationId> SubmitCopy(
64+
const CopySpec& copy_spec, const std::vector<CompilerHint>& hints = {});
6265

63-
std::optional<CopyOperationId> SubmitPull(const CopySpec& copy_spec);
66+
std::optional<CopyOperationId> SubmitPull(
67+
const CopySpec& copy_spec, const std::vector<CompilerHint>& hints = {});
6468

6569
void WaitForCopy(CopyOperationId copy_op_id);
6670

csrc/setu/client/Pybind.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,25 @@
1616
//==============================================================================
1717
#include "commons/utils/Pybind.h"
1818

19+
#include <pybind11/stl.h>
20+
1921
#include "client/Client.h"
2022
#include "commons/Logging.h"
2123
#include "commons/StdCommon.h"
2224
#include "commons/TorchCommon.h"
2325
#include "commons/datatypes/CopySpec.h"
2426
#include "commons/datatypes/TensorShardSpec.h"
2527
#include "commons/enums/Enums.h"
28+
#include "planner/hints/Hint.h"
2629
//==============================================================================
2730
namespace setu::client {
2831
//==============================================================================
2932
using setu::commons::CopyOperationId;
3033
using setu::commons::datatypes::CopySpec;
3134
using setu::commons::datatypes::TensorShardSpec;
3235
using setu::commons::enums::ErrorCode;
36+
using setu::planner::hints::CompilerHint;
37+
using setu::planner::hints::RoutingHint;
3338
//==============================================================================
3439
void InitClientPybindClass(py::module_& m) {
3540
py::class_<Client, std::shared_ptr<Client>>(m, "Client")
@@ -46,8 +51,10 @@ void InitClientPybindClass(py::module_& m) {
4651
py::arg("shard_spec"),
4752
"Register a tensor shard and return a reference to it")
4853
.def("submit_copy", &Client::SubmitCopy, py::arg("copy_spec"),
54+
py::arg("hints") = std::vector<CompilerHint>{},
4955
"Submit a copy operation and return an operation ID")
5056
.def("submit_pull", &Client::SubmitPull, py::arg("copy_spec"),
57+
py::arg("hints") = std::vector<CompilerHint>{},
5158
"Submit a pull operation and return an operation ID")
5259
.def("wait_for_copy", &Client::WaitForCopy, py::arg("copy_op_id"),
5360
"Wait for a copy operation to complete")

csrc/setu/commons/datatypes/Device.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ struct Device {
9898
return static_cast<std::int16_t>(torch_device.index());
9999
}
100100

101-
torch::Device torch_device; ///< PyTorch device (type + local index)
101+
torch::Device torch_device{
102+
torch::kCUDA}; ///< PyTorch device (type + local index)
102103
};
103104
//==============================================================================
104105
} // namespace setu::commons::datatypes

csrc/setu/coordinator/Coordinator.cpp

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,8 @@ Coordinator::Coordinator(std::size_t port, PlannerPtr planner)
6060

6161
handler_ = std::make_unique<Handler>(inbox_queue_, outbox_queue_, metastore_,
6262
planner_queue_, outbox_notify);
63-
executor_ =
64-
std::make_unique<Executor>(planner_queue_, outbox_queue_, metastore_,
65-
*planner_, hint_store_, outbox_notify);
63+
executor_ = std::make_unique<Executor>(planner_queue_, outbox_queue_,
64+
metastore_, *planner_, outbox_notify);
6665
}
6766

6867
Coordinator::~Coordinator() {
@@ -108,12 +107,6 @@ std::optional<CopyOperationId> Coordinator::SubmitCopy(
108107
return std::nullopt;
109108
}
110109

111-
void Coordinator::AddHint(setu::planner::hints::CompilerHint hint) {
112-
hint_store_.AddHint(std::move(hint));
113-
}
114-
115-
void Coordinator::ClearHints() { hint_store_.Clear(); }
116-
117110
void Coordinator::PlanExecuted(CopyOperationId copy_op_id) {
118111
LOG_DEBUG("Plan executed for copy operation ID: {}", copy_op_id);
119112

@@ -387,7 +380,8 @@ void Coordinator::Handler::HandleSubmitCopyRequest(
387380
metastore_.GetNumShardsForTensor(request.copy_spec.dst_name);
388381

389382
HandleShardSubmission(node_agent_identity, request.request_id,
390-
request.shard_id, request.copy_spec, expected_shards);
383+
request.shard_id, request.copy_spec, expected_shards,
384+
std::vector(request.hints), request.hints_fingerprint);
391385
}
392386

393387
void Coordinator::Handler::HandleSubmitPullRequest(
@@ -413,18 +407,21 @@ void Coordinator::Handler::HandleSubmitPullRequest(
413407
metastore_.GetNumShardsForTensor(request.copy_spec.dst_name);
414408

415409
HandleShardSubmission(node_agent_identity, request.request_id,
416-
request.shard_id, request.copy_spec, expected_shards);
410+
request.shard_id, request.copy_spec, expected_shards,
411+
std::vector(request.hints), request.hints_fingerprint);
417412
}
418413

419414
void Coordinator::Handler::HandleShardSubmission(
420415
const Identity& node_agent_identity, const RequestId& request_id,
421416
const ShardId& shard_id, const CopySpec& copy_spec,
422-
std::size_t expected_shards) {
417+
std::size_t expected_shards,
418+
std::vector<setu::planner::hints::CompilerHint> hints,
419+
std::uint64_t hints_fingerprint) {
423420
using setu::commons::utils::AggregationParticipant;
424421

425422
CopyKey copy_key{copy_spec.src_name, copy_spec.dst_name};
426423

427-
auto result = shard_aggregator_.Submit(
424+
auto result = pending_dispatch_.SubmitShard(
428425
copy_key, shard_id, copy_spec,
429426
AggregationParticipant{node_agent_identity, request_id}, expected_shards,
430427
[](const CopySpec& stored, const CopySpec& incoming) {
@@ -437,7 +434,8 @@ void Coordinator::Handler::HandleShardSubmission(
437434
*incoming.dst_selection == *stored.dst_selection,
438435
"Shard submission {} -> {}: destination selection mismatch",
439436
incoming.src_name, incoming.dst_name);
440-
});
437+
},
438+
std::move(hints), hints_fingerprint);
441439

442440
if (!result.has_value()) {
443441
return;
@@ -459,14 +457,15 @@ void Coordinator::Handler::HandleShardSubmission(
459457
}
460458

461459
// Create shared state with submitter identities
462-
auto state = std::make_shared<CopyOperationState>(result->payload,
463-
std::move(submitters));
460+
auto state =
461+
std::make_shared<CopyOperationState>(result->spec, std::move(submitters));
464462

465463
// Store the shared state (will be accessed by HandleExecuteResponse)
466464
copy_operations_.emplace(copy_op_id, state);
467465

468-
// Add to planner queue with copy_op_id and shared state
469-
planner_queue_.push(PlannerTask{copy_op_id, result->payload, state});
466+
// Add to planner queue with copy_op_id, shared state, and per-op hints
467+
planner_queue_.push(PlannerTask{copy_op_id, result->spec, state,
468+
HintStore(std::move(result->hints))});
470469

471470
// Send responses to all waiting participants with copy_op_id
472471
for (const auto& participant : result->participants) {
@@ -588,15 +587,25 @@ void Coordinator::Handler::HandleDeregisterShardsRequest(
588587
metastore_.MarkTensorDeregistered(name);
589588
}
590589

591-
// Cancel partial entries in the shard aggregator for these tensors.
590+
// Cancel partial entries in the pending dispatch for these tensors.
592591
// This cleans up groups that will never complete because the shards are
593592
// going away.
594593
auto cancelled_participants =
595-
shard_aggregator_.CancelIf([&tensor_names](const CopyKey& key) {
594+
pending_dispatch_.CancelIf([&tensor_names](const CopyKey& key) {
596595
return tensor_names.contains(key.src_name) ||
597596
tensor_names.contains(key.dst_name);
598597
});
599598

599+
// Clean up per-operation hint tracking for cancelled operations
600+
std::erase_if(operation_hints_, [&tensor_names](const auto& entry) {
601+
return tensor_names.contains(entry.first.src_name) ||
602+
tensor_names.contains(entry.first.dst_name);
603+
});
604+
std::erase_if(operation_fingerprints_, [&tensor_names](const auto& entry) {
605+
return tensor_names.contains(entry.first.src_name) ||
606+
tensor_names.contains(entry.first.dst_name);
607+
});
608+
600609
// Send error responses to cancelled participants
601610
for (const auto& participant : cancelled_participants) {
602611
LOG_INFO(
@@ -651,13 +660,11 @@ void Coordinator::Handler::HandleDeregisterShardsRequest(
651660
Coordinator::Executor::Executor(Queue<PlannerTask>& planner_queue,
652661
Queue<OutboxMessage>& outbox_queue,
653662
MetaStore& metastore, Planner& planner,
654-
HintStore& hint_store,
655663
OutboxNotifyFn outbox_notify)
656664
: planner_queue_(planner_queue),
657665
outbox_queue_(outbox_queue),
658666
metastore_(metastore),
659667
planner_(planner),
660-
hint_store_(hint_store),
661668
outbox_notify_(std::move(outbox_notify)) {}
662669

663670
void Coordinator::Executor::PushOutbox(OutboxMessage msg) {
@@ -690,9 +697,8 @@ void Coordinator::Executor::Loop() {
690697

691698
LOG_DEBUG("Executor received task for copy_op_id: {}", task.copy_op_id);
692699

693-
auto hints = hint_store_.Snapshot();
694700
auto t_compile_start = std::chrono::steady_clock::now();
695-
Plan plan = planner_.Compile(task.copy_spec, metastore_, hints);
701+
Plan plan = planner_.Compile(task.copy_spec, metastore_, task.hints);
696702
auto t_compile_end = std::chrono::steady_clock::now();
697703

698704
LOG_DEBUG("Compiled plan:\n{}", plan);

0 commit comments

Comments (0)