-
Notifications
You must be signed in to change notification settings - Fork 171
Reduce QP overheads #1140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Reduce QP overheads #1140
Changes from all commits
588d5c5
cd41420
020312b
104b82c
0002029
f6be237
7b8c2ec
0c59dd3
d4bfc9f
8ea77bf
f340ca2
a80f902
add1a01
47ddf84
2f5db64
ebb9358
1b9078f
fe14d1c
c779359
a1ab6b9
256c394
b02aedf
0bb1aed
d806a5a
fcc6652
3b56eff
464f768
6c1cc72
52bd299
c0e2542
ab709dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| /* clang-format off */ | ||
| /* | ||
| * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
| /* clang-format on */ | ||
|
|
@@ -16,8 +16,18 @@ | |
| #include <utilities/copy_helpers.hpp> | ||
| #include <utilities/cuda_helpers.cuh> | ||
|
|
||
| #include <thrust/device_ptr.h> | ||
| #include <thrust/iterator/counting_iterator.h> | ||
| #include <thrust/iterator/zip_iterator.h> | ||
| #include <thrust/sort.h> | ||
| #include <thrust/tabulate.h> | ||
| #include <thrust/tuple.h> | ||
|
|
||
| namespace cuopt::linear_programming::dual_simplex { | ||
|
|
||
| template <typename IndexType, typename ValueType> | ||
| class device_csr_matrix_t; | ||
|
|
||
| template <typename f_t> | ||
| struct sum_reduce_helper_t { | ||
| rmm::device_buffer buffer_data; | ||
|
|
@@ -158,6 +168,9 @@ class device_csc_matrix_t { | |
| raft::copy(x.data(), A.x.data(), A.x.size(), stream); | ||
| } | ||
|
|
||
| /** Same semantics as csc_matrix_t::to_compressed_row, entirely on device. */ | ||
| void to_compressed_row(device_csr_matrix_t<i_t, f_t>& Arow, rmm::cuda_stream_view stream) const; | ||
|
|
||
| void form_col_index(rmm::cuda_stream_view stream) | ||
| { | ||
| col_index.resize(x.size(), stream); | ||
|
|
@@ -293,4 +306,82 @@ class device_csr_matrix_t { | |
| // to avoid extra space / computation) | ||
| }; | ||
|
|
||
| template <typename i_t, typename f_t> | ||
| void device_csc_matrix_t<i_t, f_t>::to_compressed_row(device_csr_matrix_t<i_t, f_t>& Arow, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is really cool! |
||
| rmm::cuda_stream_view stream) const | ||
| { | ||
| static_assert(std::is_signed_v<i_t>); | ||
| i_t const mm = m; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: why can't you just use m and n?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because you want these to be const? |
||
| i_t const nn = n; | ||
| i_t const nz = nz_max; | ||
|
|
||
| Arow.m = mm; | ||
| Arow.n = nn; | ||
| Arow.nz_max = nz; | ||
| Arow.row_start.resize(mm + 1, stream); | ||
| Arow.j.resize(nz, stream); | ||
| Arow.x.resize(nz, stream); | ||
|
|
||
| auto exec = rmm::exec_policy(stream); | ||
|
|
||
| if (nz == 0) { | ||
| RAFT_CUDA_TRY(cudaMemsetAsync(Arow.row_start.data(), 0, sizeof(i_t) * (mm + 1), stream)); | ||
| return; | ||
| } | ||
|
|
||
| rmm::device_uvector<i_t> row_counts(mm, stream); | ||
| RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, sizeof(i_t) * mm, stream)); | ||
|
|
||
| thrust::for_each(exec, | ||
| thrust::make_counting_iterator<i_t>(0), | ||
| thrust::make_counting_iterator<i_t>(nz), | ||
| [row_ind = i.data(), counts = row_counts.data()] __device__(i_t p) { | ||
| atomicAdd(counts + row_ind[p], i_t(1)); | ||
| }); | ||
|
|
||
| rmm::device_buffer scan_tmp; | ||
| std::size_t scan_bytes = 0; | ||
| cub::DeviceScan::ExclusiveSum( | ||
| nullptr, scan_bytes, row_counts.data(), Arow.row_start.data(), mm, stream); | ||
| scan_tmp.resize(scan_bytes, stream); | ||
| cub::DeviceScan::ExclusiveSum( | ||
| scan_tmp.data(), scan_bytes, row_counts.data(), Arow.row_start.data(), mm, stream); | ||
|
|
||
| RAFT_CUDA_TRY( | ||
| cudaMemcpyAsync(Arow.row_start.data() + mm, &nz, sizeof(i_t), cudaMemcpyHostToDevice, stream)); | ||
|
|
||
| rmm::device_uvector<i_t> rows(nz, stream); | ||
| rmm::device_uvector<i_t> cols(nz, stream); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is cols needed? Why not directly use Arow.x? You could always make cols a reference to Arow.x if you wanted to keep the name cols. But it seems like you need an extra copy at the end. |
||
| rmm::device_uvector<f_t> vals(nz, stream); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is vals needed? Why not directly use Arow.x? |
||
| raft::copy(rows.data(), i.data(), nz, stream); | ||
| raft::copy(vals.data(), x.data(), nz, stream); | ||
|
|
||
| thrust::tabulate(exec, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is using a binary search to find out which column the nonzero stored at index p goes in?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, you are expanding back to triplet form?
||
| thrust::device_pointer_cast(cols.data()), | ||
| thrust::device_pointer_cast(cols.data() + nz), | ||
| [cs = col_start.data(), nn_c = nn] __device__(i_t p) { | ||
| i_t lo = 0; | ||
| i_t hi = nn_c; | ||
| while (lo < hi) { | ||
| i_t mid = lo + (hi - lo) / 2; | ||
| if (cs[mid] <= p) { | ||
| lo = mid + 1; | ||
| } else { | ||
| hi = mid; | ||
| } | ||
| } | ||
| return lo - 1; | ||
| }); | ||
|
|
||
| auto row_iter = thrust::device_pointer_cast(rows.data()); | ||
| auto col_iter = thrust::device_pointer_cast(cols.data()); | ||
| thrust::sort_by_key(exec, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No issue with the code, but I'd love to understand how it works. You want to sort the nonzeros by row. I'm not sure I understand why the second iterator has row_iter + nz and col_iter + nz. |
||
| thrust::make_zip_iterator(thrust::make_tuple(row_iter, col_iter)), | ||
| thrust::make_zip_iterator(thrust::make_tuple(row_iter + nz, col_iter + nz)), | ||
| thrust::device_pointer_cast(vals.data())); | ||
|
|
||
| raft::copy(Arow.j.data(), cols.data(), nz, stream); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can get rid of these two copies and extra allocations by using Arow.j and Arow.x directly. |
||
| raft::copy(Arow.x.data(), vals.data(), nz, stream); | ||
| } | ||
|
|
||
| } // namespace cuopt::linear_programming::dual_simplex | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| #include <dual_simplex/solve.hpp> | ||
| #include <dual_simplex/tic_toc.hpp> | ||
|
|
||
| #include <algorithm> | ||
| #include <cmath> | ||
| #include <iostream> | ||
| #include <numeric> | ||
|
|
@@ -828,7 +829,6 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original, | |
| } | ||
|
|
||
| if (settings.barrier_presolve && free_variables > 0) { | ||
| // Try to remove free variables | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add back comment. |
||
| std::vector<i_t> constraints_to_check; | ||
| std::vector<i_t> current_free_variables; | ||
| std::vector<i_t> row_marked(problem.num_rows, 0); | ||
|
|
@@ -850,8 +850,7 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original, | |
| } | ||
|
|
||
| i_t removed_free_variables = 0; | ||
|
|
||
| if (constraints_to_check.size() > 0) { | ||
| if (!constraints_to_check.empty()) { | ||
| // Check if the constraints are feasible | ||
| csr_matrix_t<i_t, f_t> Arow(0, 0, 0); | ||
| problem.A.to_compressed_row(Arow); | ||
|
|
@@ -973,15 +972,14 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original, | |
| } | ||
| } | ||
|
|
||
| i_t new_free_variables = 0; | ||
| free_variables = 0; | ||
| for (i_t j = 0; j < problem.num_cols; j++) { | ||
| if (problem.lower[j] == -inf && problem.upper[j] == inf) { new_free_variables++; } | ||
| if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } | ||
| } | ||
| if (removed_free_variables != 0) { | ||
| settings.log.printf("Bounded %d free variables\n", removed_free_variables); | ||
| settings.log.printf("Bounded %d free variable row(s) in presolve\n", | ||
| static_cast<int>(removed_free_variables)); | ||
| } | ||
| assert(new_free_variables == free_variables - removed_free_variables); | ||
| free_variables = new_free_variables; | ||
| } | ||
|
|
||
| // The original problem may have a variable without a lower bound | ||
|
|
@@ -1139,7 +1137,18 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original, | |
|
|
||
| problem.Q.check_matrix("Before free variable expansion"); | ||
|
|
||
| if (settings.barrier_presolve && free_variables > 0) { | ||
| // For QPs, keep free variables as-is rather than | ||
| // splitting x = v - w. The barrier solver handles them natively with a | ||
| // static regularizer on the diagonal instead of z/x complementarity terms. | ||
| if (settings.barrier_presolve && free_variables > 0 && problem.Q.n > 0) { | ||
| presolve_info.free_variable_indices.clear(); | ||
| for (i_t j = 0; j < problem.num_cols; j++) { | ||
| if (problem.lower[j] == -inf && problem.upper[j] == inf) { | ||
| presolve_info.free_variable_indices.push_back(j); | ||
| } | ||
| } | ||
| settings.log.printf("Keeping %d free variables for QP augmented system\n", free_variables); | ||
| } else if (settings.barrier_presolve && free_variables > 0) { | ||
| // We have a variable x_j: with -inf < x_j < inf | ||
| // we create new variables v and w with 0 <= v, w and x_j = v - w | ||
| // Constraints | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -102,6 +102,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings | |
| {CUOPT_DUAL_INFEASIBLE_TOLERANCE, &pdlp_settings.tolerances.dual_infeasible_tolerance, f_t(0.0), f_t(1e-1), std::max(f_t(1e-10), std::numeric_limits<f_t>::epsilon())}, | ||
| {CUOPT_MIP_CUT_CHANGE_THRESHOLD, &mip_settings.cut_change_threshold, f_t(-1.0), std::numeric_limits<f_t>::infinity(), f_t(-1.0)}, | ||
| {CUOPT_MIP_CUT_MIN_ORTHOGONALITY, &mip_settings.cut_min_orthogonality, f_t(0.0), f_t(1.0), f_t(0.5)}, | ||
| {CUOPT_BARRIER_STEP_SCALE, &pdlp_settings.barrier_step_scale, f_t(0.5), f_t(1.0), f_t(0.9)}, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't ever go to 1.0. The max needs to be something like 0.9999 |
||
| // MIP heuristic hyper-parameters (hidden from default --help: name contains "hyper_") | ||
| {CUOPT_MIP_HYPER_HEURISTIC_PRESOLVE_TIME_RATIO, &mip_settings.heuristic_params.presolve_time_ratio, f_t(0.0), f_t(1.0), f_t(0.1), "fraction of total time for presolve"}, | ||
| {CUOPT_MIP_HYPER_HEURISTIC_PRESOLVE_MAX_TIME, &mip_settings.heuristic_params.presolve_max_time, f_t(0.0), std::numeric_limits<f_t>::infinity(), f_t(60.0), "hard cap on presolve seconds"}, | ||
|
|
@@ -172,6 +173,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings | |
| {CUOPT_ELIMINATE_DENSE_COLUMNS, &pdlp_settings.eliminate_dense_columns, true}, | ||
| {CUOPT_CUDSS_DETERMINISTIC, &pdlp_settings.cudss_deterministic, false}, | ||
| {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true}, | ||
| {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true}, | ||
| }; | ||
| // String parameters | ||
| string_parameters = { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: could you put these next to CUOPT_BARRIER_DUAL_INITIAL_POINT above, so we keep all the barrier parameters together?