-
Notifications
You must be signed in to change notification settings - Fork 171
Reduce QP overheads #1140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Reduce QP overheads #1140
Changes from all commits
588d5c5
cd41420
020312b
104b82c
0002029
f6be237
7b8c2ec
0c59dd3
d4bfc9f
8ea77bf
f340ca2
a80f902
add1a01
47ddf84
2f5db64
ebb9358
1b9078f
fe14d1c
c779359
a1ab6b9
256c394
b02aedf
0bb1aed
d806a5a
fcc6652
3b56eff
464f768
6c1cc72
52bd299
c0e2542
ab709dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| /* clang-format off */ | ||
| /* | ||
| * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
| /* clang-format on */ | ||
|
|
@@ -16,8 +16,18 @@ | |
| #include <utilities/copy_helpers.hpp> | ||
| #include <utilities/cuda_helpers.cuh> | ||
|
|
||
| #include <thrust/device_ptr.h> | ||
| #include <thrust/iterator/counting_iterator.h> | ||
| #include <thrust/iterator/zip_iterator.h> | ||
| #include <thrust/sort.h> | ||
| #include <thrust/tabulate.h> | ||
| #include <thrust/tuple.h> | ||
|
|
||
| namespace cuopt::linear_programming::dual_simplex { | ||
|
|
||
| template <typename IndexType, typename ValueType> | ||
| class device_csr_matrix_t; | ||
|
|
||
| template <typename f_t> | ||
| struct sum_reduce_helper_t { | ||
| rmm::device_buffer buffer_data; | ||
|
|
@@ -158,6 +168,9 @@ class device_csc_matrix_t { | |
| raft::copy(x.data(), A.x.data(), A.x.size(), stream); | ||
| } | ||
|
|
||
| /** Same semantics as csc_matrix_t::to_compressed_row, entirely on device. */ | ||
| void to_compressed_row(device_csr_matrix_t<i_t, f_t>& Arow, rmm::cuda_stream_view stream) const; | ||
|
|
||
| void form_col_index(rmm::cuda_stream_view stream) | ||
| { | ||
| col_index.resize(x.size(), stream); | ||
|
|
@@ -293,4 +306,82 @@ class device_csr_matrix_t { | |
| // to avoid extra space / computation) | ||
| }; | ||
|
|
||
| template <typename i_t, typename f_t> | ||
| void device_csc_matrix_t<i_t, f_t>::to_compressed_row(device_csr_matrix_t<i_t, f_t>& Arow, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is really cool! |
||
| rmm::cuda_stream_view stream) const | ||
| { | ||
| static_assert(std::is_signed_v<i_t>); | ||
| i_t const mm = m; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: why can't you just use m and n?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because you want these to be const? |
||
| i_t const nn = n; | ||
| i_t const nz = nz_max; | ||
|
|
||
| Arow.m = mm; | ||
| Arow.n = nn; | ||
| Arow.nz_max = nz; | ||
| Arow.row_start.resize(mm + 1, stream); | ||
| Arow.j.resize(nz, stream); | ||
| Arow.x.resize(nz, stream); | ||
|
|
||
| auto exec = rmm::exec_policy(stream); | ||
|
|
||
| if (nz == 0) { | ||
| RAFT_CUDA_TRY(cudaMemsetAsync(Arow.row_start.data(), 0, sizeof(i_t) * (mm + 1), stream)); | ||
| return; | ||
| } | ||
|
|
||
| rmm::device_uvector<i_t> row_counts(mm, stream); | ||
| RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, sizeof(i_t) * mm, stream)); | ||
|
|
||
| thrust::for_each(exec, | ||
| thrust::make_counting_iterator<i_t>(0), | ||
| thrust::make_counting_iterator<i_t>(nz), | ||
| [row_ind = i.data(), counts = row_counts.data()] __device__(i_t p) { | ||
| atomicAdd(counts + row_ind[p], i_t(1)); | ||
| }); | ||
|
|
||
| rmm::device_buffer scan_tmp; | ||
| std::size_t scan_bytes = 0; | ||
| cub::DeviceScan::ExclusiveSum( | ||
| nullptr, scan_bytes, row_counts.data(), Arow.row_start.data(), mm, stream); | ||
| scan_tmp.resize(scan_bytes, stream); | ||
| cub::DeviceScan::ExclusiveSum( | ||
| scan_tmp.data(), scan_bytes, row_counts.data(), Arow.row_start.data(), mm, stream); | ||
|
|
||
| RAFT_CUDA_TRY( | ||
| cudaMemcpyAsync(Arow.row_start.data() + mm, &nz, sizeof(i_t), cudaMemcpyHostToDevice, stream)); | ||
|
|
||
| rmm::device_uvector<i_t> rows(nz, stream); | ||
| rmm::device_uvector<i_t> cols(nz, stream); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is cols needed? Why not directly use Arow.x? You could always make cols a reference to Arow.x if you wanted to keep the name cols. But it seems like you need an extra copy at the end. |
||
| rmm::device_uvector<f_t> vals(nz, stream); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is vals needed? Why not directly use Arow.x? |
||
| raft::copy(rows.data(), i.data(), nz, stream); | ||
| raft::copy(vals.data(), x.data(), nz, stream); | ||
|
|
||
| thrust::tabulate(exec, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is using a binary search to find out which column the nonzero stored at index p goes in?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, you are expanding back to triplet form?
||
| thrust::device_pointer_cast(cols.data()), | ||
| thrust::device_pointer_cast(cols.data() + nz), | ||
| [cs = col_start.data(), nn_c = nn] __device__(i_t p) { | ||
| i_t lo = 0; | ||
| i_t hi = nn_c; | ||
| while (lo < hi) { | ||
| i_t mid = lo + (hi - lo) / 2; | ||
| if (cs[mid] <= p) { | ||
| lo = mid + 1; | ||
| } else { | ||
| hi = mid; | ||
| } | ||
| } | ||
| return lo - 1; | ||
| }); | ||
|
|
||
| auto row_iter = thrust::device_pointer_cast(rows.data()); | ||
| auto col_iter = thrust::device_pointer_cast(cols.data()); | ||
| thrust::sort_by_key(exec, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No issue with the code, but I'd love to understand how it works. You want to sort the nonzeros by row. I'm not sure I understand why the second iterator has row_iter + nz and col_iter + nz. |
||
| thrust::make_zip_iterator(thrust::make_tuple(row_iter, col_iter)), | ||
| thrust::make_zip_iterator(thrust::make_tuple(row_iter + nz, col_iter + nz)), | ||
| thrust::device_pointer_cast(vals.data())); | ||
|
|
||
| raft::copy(Arow.j.data(), cols.data(), nz, stream); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can get rid of these two copies and extra allocations by using Arow.j and Arow.x directly. |
||
| raft::copy(Arow.x.data(), vals.data(), nz, stream); | ||
| } | ||
|
|
||
| } // namespace cuopt::linear_programming::dual_simplex | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| #include <dual_simplex/solve.hpp> | ||
| #include <dual_simplex/tic_toc.hpp> | ||
|
|
||
| #include <algorithm> | ||
| #include <cmath> | ||
| #include <iostream> | ||
| #include <numeric> | ||
|
|
@@ -828,7 +829,6 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original, | |
| } | ||
|
|
||
| if (settings.barrier_presolve && free_variables > 0) { | ||
| // Try to remove free variables | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add back comment. |
||
| std::vector<i_t> constraints_to_check; | ||
| std::vector<i_t> current_free_variables; | ||
| std::vector<i_t> row_marked(problem.num_rows, 0); | ||
|
|
@@ -850,8 +850,7 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original, | |
| } | ||
|
|
||
| i_t removed_free_variables = 0; | ||
|
|
||
| if (constraints_to_check.size() > 0) { | ||
| if (!constraints_to_check.empty()) { | ||
| // Check if the constraints are feasible | ||
| csr_matrix_t<i_t, f_t> Arow(0, 0, 0); | ||
| problem.A.to_compressed_row(Arow); | ||
|
|
@@ -973,15 +972,14 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original, | |
| } | ||
| } | ||
|
|
||
| i_t new_free_variables = 0; | ||
| free_variables = 0; | ||
| for (i_t j = 0; j < problem.num_cols; j++) { | ||
| if (problem.lower[j] == -inf && problem.upper[j] == inf) { new_free_variables++; } | ||
| if (problem.lower[j] == -inf && problem.upper[j] == inf) { free_variables++; } | ||
| } | ||
| if (removed_free_variables != 0) { | ||
| settings.log.printf("Bounded %d free variables\n", removed_free_variables); | ||
| settings.log.printf("Bounded %d free variable row(s) in presolve\n", | ||
| static_cast<int>(removed_free_variables)); | ||
| } | ||
| assert(new_free_variables == free_variables - removed_free_variables); | ||
| free_variables = new_free_variables; | ||
| } | ||
|
|
||
| // The original problem may have a variable without a lower bound | ||
|
|
@@ -1139,7 +1137,18 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original, | |
|
|
||
| problem.Q.check_matrix("Before free variable expansion"); | ||
|
|
||
| if (settings.barrier_presolve && free_variables > 0) { | ||
| // For QPs, keep free variables as-is rather than | ||
| // splitting x = v - w. The barrier solver handles them natively with a | ||
| // static regularizer on the diagonal instead of z/x complementarity terms. | ||
| if (settings.barrier_presolve && free_variables > 0 && problem.Q.n > 0) { | ||
| presolve_info.free_variable_indices.clear(); | ||
| for (i_t j = 0; j < problem.num_cols; j++) { | ||
| if (problem.lower[j] == -inf && problem.upper[j] == inf) { | ||
| presolve_info.free_variable_indices.push_back(j); | ||
| } | ||
| } | ||
| settings.log.printf("Keeping %d free variables for QP augmented system\n", free_variables); | ||
| } else if (settings.barrier_presolve && free_variables > 0) { | ||
| // We have a variable x_j: with -inf < x_j < inf | ||
| // we create new variables v and w with 0 <= v, w and x_j = v - w | ||
| // Constraints | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -102,6 +102,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings | |
| {CUOPT_DUAL_INFEASIBLE_TOLERANCE, &pdlp_settings.tolerances.dual_infeasible_tolerance, f_t(0.0), f_t(1e-1), std::max(f_t(1e-10), std::numeric_limits<f_t>::epsilon())}, | ||
| {CUOPT_MIP_CUT_CHANGE_THRESHOLD, &mip_settings.cut_change_threshold, f_t(-1.0), std::numeric_limits<f_t>::infinity(), f_t(-1.0)}, | ||
| {CUOPT_MIP_CUT_MIN_ORTHOGONALITY, &mip_settings.cut_min_orthogonality, f_t(0.0), f_t(1.0), f_t(0.5)}, | ||
| {CUOPT_BARRIER_STEP_SCALE, &pdlp_settings.barrier_step_scale, f_t(0.5), f_t(1.0), f_t(0.9)}, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't ever go to 1.0. The max needs to be something like 0.9999 |
||
| // MIP heuristic hyper-parameters (hidden from default --help: name contains "hyper_") | ||
| {CUOPT_MIP_HYPER_HEURISTIC_PRESOLVE_TIME_RATIO, &mip_settings.heuristic_params.presolve_time_ratio, f_t(0.0), f_t(1.0), f_t(0.1), "fraction of total time for presolve"}, | ||
| {CUOPT_MIP_HYPER_HEURISTIC_PRESOLVE_MAX_TIME, &mip_settings.heuristic_params.presolve_max_time, f_t(0.0), std::numeric_limits<f_t>::infinity(), f_t(60.0), "hard cap on presolve seconds"}, | ||
|
|
@@ -172,6 +173,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings | |
| {CUOPT_ELIMINATE_DENSE_COLUMNS, &pdlp_settings.eliminate_dense_columns, true}, | ||
| {CUOPT_CUDSS_DETERMINISTIC, &pdlp_settings.cudss_deterministic, false}, | ||
| {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true}, | ||
| {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true}, | ||
| }; | ||
| // String parameters | ||
| string_parameters = { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: could you put these next to CUOPT_BARRIER_DUAL_INITIAL_POINT above, so we keep all the barrier parameters together?