@sriramch
Created May 3, 2019 15:17
Training patch with perf improvements: GPU quantile sketching across multiple SparsePage batches via a shared SketchContainer with per-column locks, plus per-batch histogram index (ELLPack) construction.
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index b4cc797..1981264 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -96,9 +96,8 @@ struct GPUSketcher {
bool has_weights_{false};
tree::TrainParam param_;
- std::vector<WXQSketch> sketches_;
+ SketchContainer &sketch_container_;
thrust::device_vector<size_t> row_ptrs_;
- std::vector<WXQSketch::SummaryContainer> summaries_;
thrust::device_vector<Entry> entries_;
thrust::device_vector<bst_float> fvalues_;
thrust::device_vector<bst_float> feature_weights_;
@@ -113,9 +112,9 @@ struct GPUSketcher {
public:
DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
- tree::TrainParam param) :
+ tree::TrainParam param, SketchContainer &sketch_container) :
device_(device), row_begin_(row_begin), row_end_(row_end),
- n_rows_(row_end - row_begin), param_(std::move(param)) {
+ n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
}
void Init(const SparsePage& row_batch, const MetaInfo& info, int gpu_batch_nrows) {
@@ -136,20 +135,10 @@ struct GPUSketcher {
gpu_batch_nrows_ = n_rows_;
}
- // initialize sketches
- sketches_.resize(num_cols_);
- summaries_.resize(num_cols_);
constexpr int kFactor = 8;
- double eps = 1.0 / (kFactor * param_.max_bin);
+ double eps = 1.0 / (kFactor * param_.max_bin);
size_t dummy_nlevel;
- WXQSketch::LimitSizeLevel(row_batch.Size(), eps, &dummy_nlevel, &n_cuts_);
- // double ncuts to be the same as the number of values
- // in the temporary buffers of the sketches
- n_cuts_ *= 2;
- for (int icol = 0; icol < num_cols_; ++icol) {
- sketches_[icol].Init(row_batch.Size(), eps);
- summaries_[icol].Reserve(n_cuts_);
- }
+ WXQSketch::LimitSizeLevel(gpu_batch_nrows_, eps, &dummy_nlevel, &n_cuts_);
// allocate necessary GPU buffers
dh::safe_cuda(cudaSetDevice(device_));
@@ -306,9 +295,12 @@ struct GPUSketcher {
// unpack the features; also unpack weights if present
thrust::fill(fvalues_.begin(), fvalues_.end(), NAN);
- thrust::fill(feature_weights_.begin(), feature_weights_.end(), NAN);
+ if (has_weights_) {
+ thrust::fill(feature_weights_.begin(), feature_weights_.end(), NAN);
+ }
- dim3 block3(64, 4, 1);
+ dim3 block3(16, 64, 1);
+ // NOTE: This will typically support ~4M features (64K grid blocks * 64 threads)
dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x),
dh::DivRoundUp(num_cols_, block3.y), 1);
UnpackFeaturesK<<<grid3, block3>>>
@@ -324,9 +316,15 @@ struct GPUSketcher {
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
+#pragma omp parallel for schedule(static)
for (int icol = 0; icol < num_cols_; ++icol) {
- summaries_[icol].MakeFromSorted(&cuts_h_[n_cuts_ * icol], n_cuts_cur_[icol]);
- sketches_[icol].PushSummary(summaries_[icol]);
+ WXQSketch::SummaryContainer summary;
+ summary.Reserve(n_cuts_);
+ summary.MakeFromSorted(&cuts_h_[n_cuts_ * icol], n_cuts_cur_[icol]);
+ // Spin in user land until we can grab this lock
+ while (sketch_container_.col_locks_[icol]->test_and_set());
+ sketch_container_.sketches_[icol].PushSummary(summary);
+ sketch_container_.col_locks_[icol]->clear();
}
}
@@ -342,14 +340,10 @@ struct GPUSketcher {
SketchBatch(row_batch, info, gpu_batch);
}
}
-
- void GetSummary(WXQSketch::SummaryContainer *summary, size_t const icol) {
- sketches_[icol].GetSummary(summary);
- }
};
void Sketch(const SparsePage& batch, const MetaInfo& info,
- HistCutMatrix* hmat, int gpu_batch_nrows) {
+ SketchContainer &sketch_container, int gpu_batch_nrows) {
// create device shards
shards_.resize(dist_.Devices().Size());
dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
@@ -357,7 +351,7 @@ struct GPUSketcher {
size_t size = dist_.ShardSize(info.num_row_, i);
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(dist_.Devices().DeviceId(i),
- start, start + size, param_));
+ start, start + size, param_, sketch_container));
});
// compute sketches for each shard
@@ -366,21 +360,6 @@ struct GPUSketcher {
shard->Init(batch, info, gpu_batch_nrows);
shard->Sketch(batch, info);
});
-
- // merge the sketches from all shards
- // TODO(canonizer): do it in a tree-like reduction
- int num_cols = info.num_col_;
- std::vector<WXQSketch> sketches(num_cols);
- WXQSketch::SummaryContainer summary;
- for (int icol = 0; icol < num_cols; ++icol) {
- sketches[icol].Init(batch.Size(), 1.0 / (8 * param_.max_bin));
- for (auto &shard : shards_) {
- shard->GetSummary(&summary, icol);
- sketches[icol].PushSummary(summary);
- }
- }
-
- hmat->Init(&sketches, param_.max_bin);
}
GPUSketcher(tree::TrainParam param, size_t n_rows) : param_(std::move(param)) {
@@ -395,9 +374,10 @@ struct GPUSketcher {
void DeviceSketch
(const SparsePage& batch, const MetaInfo& info,
- const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows) {
+ const tree::TrainParam& param, SketchContainer &sketch_container,
+ int gpu_batch_nrows) {
GPUSketcher sketcher(param, info.num_row_);
- sketcher.Sketch(batch, info, hmat, gpu_batch_nrows);
+ sketcher.Sketch(batch, info, sketch_container, gpu_batch_nrows);
}
} // namespace common
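
The per-column push loop above protects each shared sketch with a user-land spin lock built from `std::atomic_flag`, so several device shards can push summaries concurrently. A minimal, self-contained sketch of that lock pattern follows; the class name is illustrative and not part of the patch, and the patch itself uses the default sequentially consistent ordering, while acquire/release is shown here as the usual minimum.

```cpp
#include <atomic>

// Minimal spin lock mirroring the test_and_set()/clear() pattern above.
// test_and_set() returns the flag's previous value, so the loop exits only
// once this thread is the one that set it, i.e. acquired the lock.
class SpinLock {
 public:
  void lock() {
    while (flag_.test_and_set(std::memory_order_acquire)) {
      // busy-wait in user space; the guarded PushSummary() call is short
    }
  }
  void unlock() { flag_.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag_ = ATOMIC_FLAG_INIT;
};
```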
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 3aa2f23..2d0d10a 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -47,10 +47,24 @@ struct HistCutMatrix {
Monitor monitor_;
};
+
+/*!
+ * \brief A container that holds the device sketches across all
+ * sparse page batches which are distributed to different devices.
+ * As sketches are aggregated by column, the atomic flag simulates
+ * a lock in user land when multiple devices could be pushing
+ * sketch summaries for the same column across distinct rows.
+ */
+struct SketchContainer {
+ std::vector<HistCutMatrix::WXQSketch> sketches_;
+ std::vector<std::unique_ptr<std::atomic_flag>> col_locks_;
+};
+
/*! \brief Builds the cut matrix on the GPU */
void DeviceSketch
(const SparsePage& batch, const MetaInfo& info,
- const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows);
+ const tree::TrainParam& param, SketchContainer &sketches,
+ int gpu_batch_nrows);
/*!
* \brief A single row in global histogram index.
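
For reference, the intended call sequence for the new interface is: size and initialize a SketchContainer once, run DeviceSketch over every SparsePage batch (each call pushes per-column summaries into the shared container), and only then build the cut matrix. The sketch below is assembled from the updater and test changes later in this patch; the function name is illustrative, the surrounding xgboost headers and namespaces are assumed, and the per-batch num_row_ override (RowReset in the updater) is omitted for brevity.

```cpp
// Assumes xgboost's DMatrix, MetaInfo, tree::TrainParam, and the
// common::SketchContainer / DeviceSketch / HistCutMatrix declared above.
void BuildCutsAcrossBatches(DMatrix *dmat, const tree::TrainParam &param,
                            int gpu_batch_nrows, common::HistCutMatrix *hmat) {
  const MetaInfo &info = dmat->Info();
  common::SketchContainer sketch_container;
  sketch_container.sketches_.resize(info.num_col_);
  sketch_container.col_locks_.resize(info.num_col_);
  for (size_t icol = 0; icol < info.num_col_; ++icol) {
    // 1/(8 * max_bin) matches the summary accuracy used by the updater
    sketch_container.sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
    sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
  }
  for (const auto &batch : dmat->GetRowBatches()) {
    // each call merges this batch's per-column summaries into the container
    common::DeviceSketch(batch, info, param, sketch_container, gpu_batch_nrows);
  }
  // build the cut matrix once, over the sketches accumulated from all batches
  hmat->Init(&sketch_container.sketches_, param.max_bin);
}
```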
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index e93ca9a..573d17d 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -164,8 +164,10 @@ struct ELLPackMatrix {
}
void Init(common::Span<uint32_t> feature_segments,
common::Span<bst_float> min_fvalue,
- common::Span<bst_float> gidx_fvalue_map, size_t row_stride,
- common::CompressedIterator<uint32_t> gidx_iter, bool is_dense,
+ common::Span<bst_float> gidx_fvalue_map,
+ size_t row_stride,
+ common::CompressedIterator<uint32_t> gidx_iter,
+ bool is_dense,
int null_gidx_value) {
this->feature_segments = feature_segments;
this->min_fvalue = min_fvalue;
@@ -646,8 +648,6 @@ struct DeviceShard {
/*! \brief Sum gradient for each node. */
std::vector<GradientPair> node_sum_gradients;
common::Span<GradientPair> node_sum_gradients_d;
- /*! \brief row offset in SparsePage (the input data). */
- thrust::device_vector<size_t> row_ptrs;
/*! \brief On-device feature set, only actually used on one of the devices */
thrust::device_vector<int> feature_set_d;
thrust::device_vector<int64_t>
@@ -656,6 +656,7 @@ struct DeviceShard {
bst_uint row_begin_idx;
bst_uint row_end_idx;
bst_uint n_rows;
+ bst_uint rows_processed;
TrainParam param;
bool prediction_cache_initialised;
@@ -674,36 +675,17 @@ struct DeviceShard {
row_begin_idx(row_begin),
row_end_idx(row_end),
n_rows(row_end - row_begin),
+ rows_processed(0),
n_bins(0),
param(std::move(_param)),
prediction_cache_initialised(false) {}
- /* Init row_ptrs and row_stride */
- size_t InitRowPtrs(const SparsePage& row_batch) {
- const auto& offset_vec = row_batch.offset.HostVector();
- row_ptrs.resize(n_rows + 1);
- thrust::copy(offset_vec.data() + row_begin_idx,
- offset_vec.data() + row_end_idx + 1,
- row_ptrs.begin());
- auto row_iter = row_ptrs.begin();
- // find the maximum row size for converting to ELLPack
- auto get_size = [=] __device__(size_t row) {
- return row_iter[row + 1] - row_iter[row];
- }; // NOLINT
-
- auto counting = thrust::make_counting_iterator(size_t(0));
- using TransformT = thrust::transform_iterator<decltype(get_size),
- decltype(counting), size_t>;
- TransformT row_size_iter = TransformT(counting, get_size);
- size_t row_stride = thrust::reduce(row_size_iter, row_size_iter + n_rows, 0,
- thrust::maximum<size_t>());
- return row_stride;
- }
-
void InitCompressedData(
- const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense);
+ const common::HistCutMatrix& hmat, size_t row_stride, bool is_dense);
- void CreateHistIndices(const SparsePage& row_batch, size_t row_stride, int null_gidx_value);
+ void CreateHistIndices(
+ const SparsePage& row_batch, size_t shard_idx,
+ const std::vector<size_t> &shard_allocations, int null_gidx_value);
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_id));
@@ -1045,11 +1027,13 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
template <typename GradientSumT>
inline void DeviceShard<GradientSumT>::InitCompressedData(
- const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense) {
- size_t row_stride = this->InitRowPtrs(row_batch);
+ const common::HistCutMatrix& hmat, size_t row_stride, bool is_dense) {
n_bins = hmat.row_ptr.back();
int null_gidx_value = hmat.row_ptr.back();
+ CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
+ << "Max leaves and max depth cannot both be unconstrained for "
+ "gpu_hist.";
int max_nodes =
param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth);
@@ -1072,24 +1056,17 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
node_sum_gradients.resize(max_nodes);
ridx_segments.resize(max_nodes);
-
// allocate compressed bin data
int num_symbols = n_bins + 1;
// Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
num_symbols);
-
- CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
- << "Max leaves and max depth cannot both be unconstrained for "
- "gpu_hist.";
ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
thrust::fill(
thrust::device_pointer_cast(gidx_buffer.data()),
thrust::device_pointer_cast(gidx_buffer.data() + gidx_buffer.size()), 0);
- this->CreateHistIndices(row_batch, row_stride, null_gidx_value);
-
ellpack_matrix.Init(
feature_segments, min_fvalue,
gidx_fvalue_map, row_stride,
@@ -1113,23 +1090,42 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
template <typename GradientSumT>
inline void DeviceShard<GradientSumT>::CreateHistIndices(
- const SparsePage& row_batch, size_t row_stride, int null_gidx_value) {
+ const SparsePage& row_batch,
+ size_t shard_idx,
+ const std::vector<size_t> &shard_allocations,
+ int null_gidx_value) {
+ // Has anything been allocated to me in this batch?
+ size_t num_elems = shard_allocations[shard_idx];
+ if (!num_elems) return;
+
+ size_t row_stride = this->ellpack_matrix.row_stride;
+
+ // Take into account the rows that could be processed by other GPUs
+ size_t begin_idx(0);
+ for (size_t i = 0; i < shard_idx; ++i) begin_idx += shard_allocations[i];
+
+ const auto& offset_vec = row_batch.offset.HostVector();
+ /*! \brief row offset in SparsePage (the input data). */
+ thrust::device_vector<size_t> row_ptrs(num_elems+1);
+ thrust::copy(offset_vec.data() + begin_idx,
+ offset_vec.data() + begin_idx + num_elems + 1,
+ row_ptrs.begin());
+
int num_symbols = n_bins + 1;
// bin and compress entries in batches of rows
size_t gpu_batch_nrows =
std::min
(dh::TotalMemory(device_id) / (16 * row_stride * sizeof(Entry)),
- static_cast<size_t>(n_rows));
+ static_cast<size_t>(num_elems));
const std::vector<Entry>& data_vec = row_batch.data.HostVector();
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
- size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
-
+ size_t gpu_nbatches = dh::DivRoundUp(num_elems, gpu_batch_nrows);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
- if (batch_row_end > n_rows) {
- batch_row_end = n_rows;
+ if (batch_row_end > num_elems) {
+ batch_row_end = num_elems;
}
size_t batch_nrows = batch_row_end - batch_row_begin;
// number of entries in this batch.
@@ -1140,19 +1136,25 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
(entries_d.data().get(), data_vec.data() + row_ptrs[batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
const dim3 block3(32, 8, 1); // 256 threads
- const dim3 grid3(dh::DivRoundUp(n_rows, block3.x),
+ const dim3 grid3(dh::DivRoundUp(num_elems, block3.x),
dh::DivRoundUp(row_stride, block3.y), 1);
CompressBinEllpackKernel<<<grid3, block3>>>
(common::CompressedBufferWriter(num_symbols),
gidx_buffer.data(),
row_ptrs.data().get() + batch_row_begin,
entries_d.data().get(),
- gidx_fvalue_map.data(), feature_segments.data(),
- batch_row_begin, batch_nrows,
+ gidx_fvalue_map.data(),
+ feature_segments.data(),
+ rows_processed + batch_row_begin,
+ batch_nrows,
row_ptrs[batch_row_begin],
- row_stride, null_gidx_value);
+ row_stride,
+ null_gidx_value);
}
+ // This will be the offset into the gidx_buffer for the next batch
+ rows_processed += num_elems;
+
// free the memory that is no longer needed
row_ptrs.resize(0);
row_ptrs.shrink_to_fit();
@@ -1216,8 +1218,6 @@ class GPUHistMakerSpecialised{
reducer_.Init(device_list_);
- auto batch_iter = dmat->GetRowBatches().begin();
- const SparsePage& batch = *batch_iter;
// Create device shards
shards_.resize(n_devices);
dh::ExecuteIndexShards(
@@ -1231,23 +1231,99 @@ class GPUHistMakerSpecialised{
start + size, param_));
});
- // Find the cuts.
+ // Initialize sketches shared across all batches
+ common::SketchContainer sketch_container;
+ sketch_container.sketches_.resize(info_->num_col_);
+ sketch_container.col_locks_.resize(info_->num_col_);
+#pragma omp parallel for schedule(static)
+ for (int icol = 0; icol < info_->num_col_; ++icol) {
+ sketch_container.sketches_[icol].Init(info_->num_row_, 1.0 / (8 * param_.max_bin));
+ sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
+ }
+
+ size_t row_stride(0); // Max row_stride across the entire dataset
+ size_t processed_size(0);
+
+ // Rows allocated to each GPU in a batch
+ std::vector<std::vector<size_t>> shard_allocations;
monitor_.StartCuda("Quantiles");
- common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows);
- n_bins_ = hmat_.row_ptr.back();
+ for (const auto &batch : dmat->GetRowBatches()) {
+ struct RowReset {
+ public:
+ RowReset(MetaInfo &minfo, const SparsePage &r_batch)
+ : minfo_(minfo) {
+ orig_nrows_ = minfo_.num_row_;
+ minfo.num_row_ = r_batch.Size();
+ }
+
+ ~RowReset() {
+ minfo_.num_row_ = orig_nrows_;
+ }
+ private:
+ MetaInfo &minfo_;
+ size_t orig_nrows_;
+ } rowgd(*info_, batch);
+
+ // Find the cuts for each batch
+ common::DeviceSketch(batch, *info_, param_, sketch_container, hist_maker_param_.gpu_batch_nrows);
+
+ const auto &offset_vec = batch.offset.HostVector();
+ for (size_t i = 1; i < offset_vec.size(); ++i) {
+ row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
+ }
+
+ std::vector<size_t> batch_allocation(shards_.size());
+ // Distribute the rows in this batch to the different shards
+ for (size_t i = 0; i < shards_.size(); ++i) {
+ // Does this batch pertain to me?
+ ssize_t num_elems = processed_size + batch.Size() - shards_[i]->row_begin_idx;
+ ssize_t rem_elems = shards_[i]->row_end_idx - processed_size;
+ if (num_elems <= 0 || rem_elems <= 0) batch_allocation[i] = 0;
+ // How many elements do I process from this batch?
+ else {
+ num_elems = std::min(
+ std::min(
+ std::min(num_elems, rem_elems),
+ static_cast<ssize_t>(shards_[i]->n_rows)),
+ static_cast<ssize_t>(batch.Size()));
+ batch_allocation[i] = num_elems;
+ }
+ }
+ shard_allocations.push_back(batch_allocation);
+
+ processed_size += batch.Size();
+ }
monitor_.StopCuda("Quantiles");
+
+ // Initialize hmat_ for the entire data set
+ hmat_.Init(&sketch_container.sketches_, param_.max_bin);
+
+ n_bins_ = hmat_.row_ptr.back();
+
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
- monitor_.StartCuda("BinningCompression");
+ // Init global data for each shard
+ monitor_.StartCuda("InitCompressedData");
dh::ExecuteIndexShards(
&shards_,
- [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
- dh::safe_cuda(cudaSetDevice(shard->device_id));
- shard->InitCompressedData(hmat_, batch, is_dense);
+ [&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+ dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(i)));
+ shard->InitCompressedData(hmat_, row_stride, is_dense);
});
+ monitor_.StopCuda("InitCompressedData");
+
+ monitor_.StartCuda("BinningCompression");
+ size_t batch_idx(0);
+ for (const auto &batch : dmat->GetRowBatches()) {
+ dh::ExecuteIndexShards(
+ &shards_,
+ [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+ dh::safe_cuda(cudaSetDevice(shard->device_id));
+ shard->CreateHistIndices(batch, idx, shard_allocations[batch_idx], hmat_.row_ptr.back());
+ });
+ batch_idx++;
+ }
monitor_.StopCuda("BinningCompression");
- ++batch_iter;
- CHECK(batch_iter.AtEnd()) << "External memory not supported";
p_last_fmat_ = dmat;
initialised_ = true;
diff --git a/tests/cpp/common/test_gpu_hist_util.cu b/tests/cpp/common/test_gpu_hist_util.cu
index 7c4fbd7..350ec42 100644
--- a/tests/cpp/common/test_gpu_hist_util.cu
+++ b/tests/cpp/common/test_gpu_hist_util.cu
@@ -39,7 +39,16 @@ void TestDeviceSketch(const GPUSet& devices) {
// find the cuts on the GPU
const SparsePage& batch = *(*dmat)->GetRowBatches().begin();
HistCutMatrix hmat_gpu;
- DeviceSketch(batch, (*dmat)->Info(), p, &hmat_gpu, gpu_batch_nrows);
+ common::SketchContainer sketch_container;
+ sketch_container.sketches_.resize((*dmat)->Info().num_col_);
+ sketch_container.col_locks_.resize((*dmat)->Info().num_col_);
+ for (int icol = 0; icol < (*dmat)->Info().num_col_; ++icol) {
+ sketch_container.sketches_[icol].Init((*dmat)->Info().num_row_, 1.0 / (8 * p.max_bin));
+ sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
+ }
+
+ DeviceSketch(batch, (*dmat)->Info(), p, sketch_container, gpu_batch_nrows);
+ hmat_gpu.Init(&sketch_container.sketches_, p.max_bin);
// compare the cuts
double eps = 1e-2;
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 37cce8b..5804786 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -77,7 +77,13 @@ void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,
auto is_dense = (*dmat)->Info().num_nonzero_ ==
(*dmat)->Info().num_row_ * (*dmat)->Info().num_col_;
- shard->InitCompressedData(cmat, batch, is_dense);
+ size_t row_stride = 0;
+ const auto &offset_vec = batch.offset.HostVector();
+ for (size_t i = 1; i < offset_vec.size(); ++i) {
+ row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
+ }
+ shard->InitCompressedData(cmat, row_stride, is_dense);
+ shard->CreateHistIndices(batch, 0, std::vector<size_t>(1, batch.Size()), cmat.row_ptr.back());
delete dmat;
}
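
Both the updater and the test above derive the ELLPack row stride as the longest row in the batch's CSR offsets. A small hypothetical helper capturing that computation (not part of the patch; `offsets` stands in for `batch.offset.HostVector()`):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Longest row length in a CSR offset array, i.e. the ELLPack row stride.
size_t MaxRowStride(const std::vector<size_t> &offsets) {
  size_t row_stride = 0;
  for (size_t i = 1; i < offsets.size(); ++i) {
    row_stride = std::max(row_stride, offsets[i] - offsets[i - 1]);
  }
  return row_stride;
}
```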