Created May 3, 2019 15:17
Training patch with performance improvements for XGBoost's gpu_hist updater: device sketches from every SparsePage batch are merged into a shared SketchContainer guarded by per-column locks, row_stride and histogram indices are computed per batch across device shards, and the single-batch "External memory not supported" restriction is removed.
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index b4cc797..1981264 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -96,9 +96,8 @@ struct GPUSketcher {
bool has_weights_{false};
tree::TrainParam param_;
- std::vector<WXQSketch> sketches_;
+ SketchContainer &sketch_container_;
thrust::device_vector<size_t> row_ptrs_;
- std::vector<WXQSketch::SummaryContainer> summaries_;
thrust::device_vector<Entry> entries_;
thrust::device_vector<bst_float> fvalues_;
thrust::device_vector<bst_float> feature_weights_;
@@ -113,9 +112,9 @@ struct GPUSketcher {
public:
DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
- tree::TrainParam param) :
+ tree::TrainParam param, SketchContainer &sketch_container) :
device_(device), row_begin_(row_begin), row_end_(row_end),
- n_rows_(row_end - row_begin), param_(std::move(param)) {
+ n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
}
void Init(const SparsePage& row_batch, const MetaInfo& info, int gpu_batch_nrows) {
@@ -136,20 +135,10 @@ struct GPUSketcher {
gpu_batch_nrows_ = n_rows_;
}
- // initialize sketches
- sketches_.resize(num_cols_);
- summaries_.resize(num_cols_);
constexpr int kFactor = 8;
- double eps = 1.0 / (kFactor * param_.max_bin);
+ double eps = 1.0 / (kFactor * param_.max_bin);
size_t dummy_nlevel;
- WXQSketch::LimitSizeLevel(row_batch.Size(), eps, &dummy_nlevel, &n_cuts_);
- // double ncuts to be the same as the number of values
- // in the temporary buffers of the sketches
- n_cuts_ *= 2;
- for (int icol = 0; icol < num_cols_; ++icol) {
- sketches_[icol].Init(row_batch.Size(), eps);
- summaries_[icol].Reserve(n_cuts_);
- }
+ WXQSketch::LimitSizeLevel(gpu_batch_nrows_, eps, &dummy_nlevel, &n_cuts_);
// allocate necessary GPU buffers
dh::safe_cuda(cudaSetDevice(device_));
@@ -306,9 +295,12 @@ struct GPUSketcher {
// unpack the features; also unpack weights if present
thrust::fill(fvalues_.begin(), fvalues_.end(), NAN);
- thrust::fill(feature_weights_.begin(), feature_weights_.end(), NAN);
+ if (has_weights_) {
+ thrust::fill(feature_weights_.begin(), feature_weights_.end(), NAN);
+ }
- dim3 block3(64, 4, 1);
+ dim3 block3(16, 64, 1);
+ // NOTE: This will typically support ~ 4M features - 64K*64
dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x),
dh::DivRoundUp(num_cols_, block3.y), 1);
UnpackFeaturesK<<<grid3, block3>>>
@@ -324,9 +316,15 @@ struct GPUSketcher {
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
+#pragma omp parallel for schedule(static)
for (int icol = 0; icol < num_cols_; ++icol) {
- summaries_[icol].MakeFromSorted(&cuts_h_[n_cuts_ * icol], n_cuts_cur_[icol]);
- sketches_[icol].PushSummary(summaries_[icol]);
+ WXQSketch::SummaryContainer summary;
+ summary.Reserve(n_cuts_);
+ summary.MakeFromSorted(&cuts_h_[n_cuts_ * icol], n_cuts_cur_[icol]);
+ // Spin in user land until we can grab this lock
+ while (sketch_container_.col_locks_[icol]->test_and_set());
+ sketch_container_.sketches_[icol].PushSummary(summary);
+ sketch_container_.col_locks_[icol]->clear();
}
}
@@ -342,14 +340,10 @@ struct GPUSketcher {
SketchBatch(row_batch, info, gpu_batch);
}
}
-
- void GetSummary(WXQSketch::SummaryContainer *summary, size_t const icol) {
- sketches_[icol].GetSummary(summary);
- }
};
void Sketch(const SparsePage& batch, const MetaInfo& info,
- HistCutMatrix* hmat, int gpu_batch_nrows) {
+ SketchContainer &sketch_container, int gpu_batch_nrows) {
// create device shards
shards_.resize(dist_.Devices().Size());
dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
@@ -357,7 +351,7 @@ struct GPUSketcher {
size_t size = dist_.ShardSize(info.num_row_, i);
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(dist_.Devices().DeviceId(i),
- start, start + size, param_));
+ start, start + size, param_, sketch_container));
});
// compute sketches for each shard
@@ -366,21 +360,6 @@ struct GPUSketcher {
shard->Init(batch, info, gpu_batch_nrows);
shard->Sketch(batch, info);
});
-
- // merge the sketches from all shards
- // TODO(canonizer): do it in a tree-like reduction
- int num_cols = info.num_col_;
- std::vector<WXQSketch> sketches(num_cols);
- WXQSketch::SummaryContainer summary;
- for (int icol = 0; icol < num_cols; ++icol) {
- sketches[icol].Init(batch.Size(), 1.0 / (8 * param_.max_bin));
- for (auto &shard : shards_) {
- shard->GetSummary(&summary, icol);
- sketches[icol].PushSummary(summary);
- }
- }
-
- hmat->Init(&sketches, param_.max_bin);
}
GPUSketcher(tree::TrainParam param, size_t n_rows) : param_(std::move(param)) {
@@ -395,9 +374,10 @@ struct GPUSketcher {
void DeviceSketch
(const SparsePage& batch, const MetaInfo& info,
- const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows) {
+ const tree::TrainParam& param, SketchContainer &sketch_container,
+ int gpu_batch_nrows) {
GPUSketcher sketcher(param, info.num_row_);
- sketcher.Sketch(batch, info, hmat, gpu_batch_nrows);
+ sketcher.Sketch(batch, info, sketch_container, gpu_batch_nrows);
}
} // namespace common
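The per-column push in SketchBatch above relies on a tiny user-land spin lock built from std::atomic_flag, so that OpenMP threads (and shards running on different GPUs) can merge summaries for the same column without a heavier mutex. Below is a minimal standalone sketch of that locking pattern; the ColumnAggregator type and its double payload are illustrative stand-ins for the quantile sketch container, not code from the patch.

// Minimal sketch of the per-column spin-lock pattern used above.
// ColumnAggregator and its double payload are illustrative only.
#include <atomic>
#include <memory>
#include <vector>

class ColumnAggregator {
 public:
  explicit ColumnAggregator(std::size_t num_cols)
      : totals_(num_cols, 0.0), col_locks_(num_cols) {
    for (auto &lock : col_locks_) {
      // same idiom as the patch: one heap-allocated flag per column
      lock.reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
    }
  }

  void Push(std::size_t icol, double value) {
    // Spin in user land until we can grab this column's lock.
    while (col_locks_[icol]->test_and_set(std::memory_order_acquire)) {}
    totals_[icol] += value;  // critical section: merge into shared per-column state
    col_locks_[icol]->clear(std::memory_order_release);
  }

  double Total(std::size_t icol) const { return totals_[icol]; }

 private:
  std::vector<double> totals_;
  std::vector<std::unique_ptr<std::atomic_flag>> col_locks_;
};

Because each column has its own flag, threads working on different columns never contend; the patch uses the same idea to let the `#pragma omp parallel for` loop and multiple device shards push summaries into the shared SketchContainer concurrently.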
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 3aa2f23..2d0d10a 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -47,10 +47,24 @@ struct HistCutMatrix {
Monitor monitor_;
};
+
+/*!
+ * \brief A container that holds the device sketches across all
+ * sparse page batches which are distributed to different devices.
+ * As sketches are aggregated by column, the atomic flag simulates
+ * a lock in user land when multiple devices could be pushing
+ * sketch summaries for the same column across distinct rows.
+ */
+struct SketchContainer {
+ std::vector<HistCutMatrix::WXQSketch> sketches_;
+ std::vector<std::unique_ptr<std::atomic_flag>> col_locks_;
+};
+
/*! \brief Builds the cut matrix on the GPU */
void DeviceSketch
(const SparsePage& batch, const MetaInfo& info,
- const tree::TrainParam& param, SketchContainer &sketches,
+ int gpu_batch_nrows);
/*!
* \brief A single row in global histogram index.
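With the new DeviceSketch signature, the caller owns the SketchContainer and builds the HistCutMatrix itself after all batches have been sketched. The hedged helper below shows that intended call sequence end to end, mirroring the updater and test code later in the patch; BuildCutsOverBatches is an illustrative name, not part of the patch, and it assumes the patched hist_util.h (and its includes) are on the include path.

// Hypothetical end-to-end use of the patched sketching API: one shared
// SketchContainer is filled batch by batch, and the cut matrix is built
// once at the end.
#include <atomic>
#include <memory>
#include <xgboost/data.h>
#include "common/hist_util.h"

namespace xgboost {

inline void BuildCutsOverBatches(DMatrix *dmat, const tree::TrainParam &param,
                                 int gpu_batch_nrows, common::HistCutMatrix *hmat) {
  const MetaInfo &info = dmat->Info();
  common::SketchContainer sketch_container;
  sketch_container.sketches_.resize(info.num_col_);
  sketch_container.col_locks_.resize(info.num_col_);
  for (size_t icol = 0; icol < info.num_col_; ++icol) {
    // eps matches the sketcher's kFactor = 8
    sketch_container.sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
    sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
  }
  // Each batch pushes its per-column summaries into the shared container.
  // NOTE: the updater in this patch additionally scopes info.num_row_ to the
  // batch size (see the RowReset guard below); that detail is omitted here.
  for (const auto &batch : dmat->GetRowBatches()) {
    common::DeviceSketch(batch, info, param, sketch_container, gpu_batch_nrows);
  }
  // Cuts for the whole dataset come from the merged sketches.
  hmat->Init(&sketch_container.sketches_, param.max_bin);
}

}  // namespace xgboost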
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index e93ca9a..573d17d 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -164,8 +164,10 @@ struct ELLPackMatrix {
}
void Init(common::Span<uint32_t> feature_segments,
common::Span<bst_float> min_fvalue,
- common::Span<bst_float> gidx_fvalue_map, size_t row_stride,
- common::CompressedIterator<uint32_t> gidx_iter, bool is_dense,
+ common::Span<bst_float> gidx_fvalue_map,
+ size_t row_stride,
+ common::CompressedIterator<uint32_t> gidx_iter,
+ bool is_dense,
int null_gidx_value) {
this->feature_segments = feature_segments;
this->min_fvalue = min_fvalue;
@@ -646,8 +648,6 @@ struct DeviceShard {
/*! \brief Sum gradient for each node. */
std::vector<GradientPair> node_sum_gradients;
common::Span<GradientPair> node_sum_gradients_d;
- /*! \brief row offset in SparsePage (the input data). */
- thrust::device_vector<size_t> row_ptrs;
/*! \brief On-device feature set, only actually used on one of the devices */
thrust::device_vector<int> feature_set_d;
thrust::device_vector<int64_t>
@@ -656,6 +656,7 @@ struct DeviceShard {
bst_uint row_begin_idx;
bst_uint row_end_idx;
bst_uint n_rows;
+ bst_uint rows_processed;
TrainParam param;
bool prediction_cache_initialised;
@@ -674,36 +675,17 @@ struct DeviceShard {
row_begin_idx(row_begin),
row_end_idx(row_end),
n_rows(row_end - row_begin),
+ rows_processed(0),
n_bins(0),
param(std::move(_param)),
prediction_cache_initialised(false) {}
- /* Init row_ptrs and row_stride */
- size_t InitRowPtrs(const SparsePage& row_batch) {
- const auto& offset_vec = row_batch.offset.HostVector();
- row_ptrs.resize(n_rows + 1);
- thrust::copy(offset_vec.data() + row_begin_idx,
- offset_vec.data() + row_end_idx + 1,
- row_ptrs.begin());
- auto row_iter = row_ptrs.begin();
- // find the maximum row size for converting to ELLPack
- auto get_size = [=] __device__(size_t row) {
- return row_iter[row + 1] - row_iter[row];
- }; // NOLINT
-
- auto counting = thrust::make_counting_iterator(size_t(0));
- using TransformT = thrust::transform_iterator<decltype(get_size),
- decltype(counting), size_t>;
- TransformT row_size_iter = TransformT(counting, get_size);
- size_t row_stride = thrust::reduce(row_size_iter, row_size_iter + n_rows, 0,
- thrust::maximum<size_t>());
- return row_stride;
- }
-
void InitCompressedData(
- const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense);
+ const common::HistCutMatrix& hmat, size_t row_stride, bool is_dense);
- void CreateHistIndices(const SparsePage& row_batch, size_t row_stride, int null_gidx_value);
+ void CreateHistIndices(
+ const SparsePage& row_batch, size_t shard_idx,
+ const std::vector<size_t> &shard_allocations, int null_gidx_value);
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_id));
@@ -1045,11 +1027,13 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
template <typename GradientSumT>
inline void DeviceShard<GradientSumT>::InitCompressedData(
- const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense) {
- size_t row_stride = this->InitRowPtrs(row_batch);
+ const common::HistCutMatrix& hmat, size_t row_stride, bool is_dense) {
n_bins = hmat.row_ptr.back();
int null_gidx_value = hmat.row_ptr.back();
+ CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
+ << "Max leaves and max depth cannot both be unconstrained for "
+ "gpu_hist.";
int max_nodes =
param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth);
@@ -1072,24 +1056,17 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
node_sum_gradients.resize(max_nodes);
ridx_segments.resize(max_nodes);
-
// allocate compressed bin data
int num_symbols = n_bins + 1;
// Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
num_symbols);
-
- CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
- << "Max leaves and max depth cannot both be unconstrained for "
- "gpu_hist.";
ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
thrust::fill(
thrust::device_pointer_cast(gidx_buffer.data()),
thrust::device_pointer_cast(gidx_buffer.data() + gidx_buffer.size()), 0);
- this->CreateHistIndices(row_batch, row_stride, null_gidx_value);
-
ellpack_matrix.Init(
feature_segments, min_fvalue,
gidx_fvalue_map, row_stride,
@@ -1113,23 +1090,42 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
template <typename GradientSumT>
inline void DeviceShard<GradientSumT>::CreateHistIndices(
- const SparsePage& row_batch, size_t row_stride, int null_gidx_value) {
+ const SparsePage& row_batch,
+ size_t shard_idx,
+ const std::vector<size_t> &shard_allocations,
+ int null_gidx_value) {
+ // Have any rows been allocated to this shard in this batch?
+ size_t num_elems = shard_allocations[shard_idx];
+ if (!num_elems) return;
+
+ size_t row_stride = this->ellpack_matrix.row_stride;
+
+ // Take into account the rows that could be processed by other GPUs
+ size_t begin_idx(0);
+ for (size_t i = 0; i < shard_idx; ++i) begin_idx += shard_allocations[i];
+
+ const auto& offset_vec = row_batch.offset.HostVector();
+ /*! \brief row offset in SparsePage (the input data). */
+ thrust::device_vector<size_t> row_ptrs(num_elems+1);
+ thrust::copy(offset_vec.data() + begin_idx,
+ offset_vec.data() + begin_idx + num_elems + 1,
+ row_ptrs.begin());
+
int num_symbols = n_bins + 1;
// bin and compress entries in batches of rows
size_t gpu_batch_nrows =
std::min
(dh::TotalMemory(device_id) / (16 * row_stride * sizeof(Entry)),
- static_cast<size_t>(n_rows));
+ static_cast<size_t>(num_elems));
const std::vector<Entry>& data_vec = row_batch.data.HostVector();
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
- size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
-
+ size_t gpu_nbatches = dh::DivRoundUp(num_elems, gpu_batch_nrows);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
- if (batch_row_end > n_rows) {
- batch_row_end = n_rows;
+ if (batch_row_end > num_elems) {
+ batch_row_end = num_elems;
}
size_t batch_nrows = batch_row_end - batch_row_begin;
// number of entries in this batch.
@@ -1140,19 +1136,25 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
(entries_d.data().get(), data_vec.data() + row_ptrs[batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
const dim3 block3(32, 8, 1); // 256 threads
- const dim3 grid3(dh::DivRoundUp(n_rows, block3.x),
+ const dim3 grid3(dh::DivRoundUp(num_elems, block3.x),
dh::DivRoundUp(row_stride, block3.y), 1);
CompressBinEllpackKernel<<<grid3, block3>>>
(common::CompressedBufferWriter(num_symbols),
gidx_buffer.data(),
row_ptrs.data().get() + batch_row_begin,
entries_d.data().get(),
- gidx_fvalue_map.data(), feature_segments.data(),
- batch_row_begin, batch_nrows,
+ gidx_fvalue_map.data(),
+ feature_segments.data(),
+ rows_processed + batch_row_begin,
+ batch_nrows,
row_ptrs[batch_row_begin],
- row_stride, null_gidx_value);
+ row_stride,
+ null_gidx_value);
}
+ // This will be the offset into the gidx_buffer for the next batch
+ rows_processed += num_elems;
+
// free the memory that is no longer needed
row_ptrs.resize(0);
row_ptrs.shrink_to_fit();
@@ -1216,8 +1218,6 @@ class GPUHistMakerSpecialised{
reducer_.Init(device_list_);
- auto batch_iter = dmat->GetRowBatches().begin();
- const SparsePage& batch = *batch_iter;
// Create device shards
shards_.resize(n_devices);
dh::ExecuteIndexShards(
@@ -1231,23 +1231,99 @@ class GPUHistMakerSpecialised{
start + size, param_));
});
- // Find the cuts.
+ // Initialize sketches across all batches
+ common::SketchContainer sketch_container;
+ sketch_container.sketches_.resize(info_->num_col_);
+ sketch_container.col_locks_.resize(info_->num_col_);
+#pragma omp parallel for schedule(static)
+ for (int icol = 0; icol < info_->num_col_; ++icol) {
+ sketch_container.sketches_[icol].Init(info_->num_row_, 1.0 / (8 * param_.max_bin));
+ sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
+ }
+
+ size_t row_stride(0); // Max row_stride across the entire dataset
+ size_t processed_size(0);
+
+ // Rows allocated to each GPU in a batch
+ std::vector<std::vector<size_t>> shard_allocations;
monitor_.StartCuda("Quantiles");
- common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows);
- n_bins_ = hmat_.row_ptr.back();
+ for (const auto &batch : dmat->GetRowBatches()) {
+ struct RowReset {
+ public:
+ RowReset(MetaInfo &minfo, const SparsePage &r_batch)
+ : minfo_(minfo) {
+ orig_nrows_ = minfo_.num_row_;
+ minfo.num_row_ = r_batch.Size();
+ }
+
+ ~RowReset() {
+ minfo_.num_row_ = orig_nrows_;
+ }
+ private:
+ MetaInfo &minfo_;
+ size_t orig_nrows_;
+ } rowgd(*info_, batch);
+
+ // Find the cuts for each batch
+ common::DeviceSketch(batch, *info_, param_, sketch_container, hist_maker_param_.gpu_batch_nrows);
+
+ const auto &offset_vec = batch.offset.HostVector();
+ for (size_t i = 1; i < offset_vec.size(); ++i) {
+ row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
+ }
+
+ std::vector<size_t> batch_allocation(shards_.size());
+ // Distribute the rows in this batch to the different shards
+ for (size_t i = 0; i < shards_.size(); ++i) {
+ // Does this batch pertain to me?
+ ssize_t num_elems = processed_size + batch.Size() - shards_[i]->row_begin_idx;
+ ssize_t rem_elems = shards_[i]->row_end_idx - processed_size;
+ if (num_elems <= 0 || rem_elems <= 0) batch_allocation[i] = 0;
+ // How many elements do I process from this batch?
+ else {
+ num_elems = std::min(
+ std::min(
+ std::min(num_elems, rem_elems),
+ static_cast<ssize_t>(shards_[i]->n_rows)),
+ static_cast<ssize_t>(batch.Size()));
+ batch_allocation[i] = num_elems;
+ }
+ }
+ shard_allocations.push_back(batch_allocation);
+
+ processed_size += batch.Size();
+ }
monitor_.StopCuda("Quantiles");
+
+ // Initialize hmat_ for the entire data set
+ hmat_.Init(&sketch_container.sketches_, param_.max_bin);
+
+ n_bins_ = hmat_.row_ptr.back();
+
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
- monitor_.StartCuda("BinningCompression");
+ // Init global data for each shard
+ monitor_.StartCuda("InitCompressedData");
dh::ExecuteIndexShards(
&shards_,
- [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
- dh::safe_cuda(cudaSetDevice(shard->device_id));
- shard->InitCompressedData(hmat_, batch, is_dense);
+ [&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+ dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(i)));
+ shard->InitCompressedData(hmat_, row_stride, is_dense);
});
+ monitor_.StopCuda("InitCompressedData");
+
+ monitor_.StartCuda("BinningCompression");
+ size_t batch_idx(0);
+ for (const auto &batch : dmat->GetRowBatches()) {
+ dh::ExecuteIndexShards(
+ &shards_,
+ [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+ dh::safe_cuda(cudaSetDevice(shard->device_id));
+ shard->CreateHistIndices(batch, idx, shard_allocations[batch_idx], hmat_.row_ptr.back());
+ });
+ batch_idx++;
+ }
monitor_.StopCuda("BinningCompression");
- ++batch_iter;
- CHECK(batch_iter.AtEnd()) << "External memory not supported";
p_last_fmat_ = dmat;
initialised_ = true;
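The batch_allocation loop above decides how many rows of the current batch belong to each shard. The nested std::min calls are equivalent to intersecting the batch's global row range with the shard's row range; the standalone restatement below (ShardRange and AllocateBatch are illustrative names, not from the patch) makes that easier to verify.

// Standalone restatement of the per-batch row allocation above.
// ShardRange and AllocateBatch are illustrative names, not from the patch.
#include <algorithm>
#include <cstddef>
#include <vector>

struct ShardRange {
  std::size_t row_begin;  // first global row owned by this shard
  std::size_t row_end;    // one past the last global row owned by this shard
};

// The current batch covers global rows [processed, processed + batch_size).
// Each shard gets the size of the intersection of that range with its own
// [row_begin, row_end) range -- the same value the nested std::min calls
// compute in the updater.
std::vector<std::size_t> AllocateBatch(const std::vector<ShardRange> &shards,
                                       std::size_t processed,
                                       std::size_t batch_size) {
  std::vector<std::size_t> allocation(shards.size(), 0);
  const std::size_t batch_end = processed + batch_size;
  for (std::size_t i = 0; i < shards.size(); ++i) {
    const std::size_t begin = std::max(processed, shards[i].row_begin);
    const std::size_t end = std::min(batch_end, shards[i].row_end);
    allocation[i] = end > begin ? end - begin : 0;
  }
  return allocation;
}

// Example: shards owning rows [0, 500) and [500, 1000); a 300-row batch that
// starts at global row 400 is split as {100, 200}.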
diff --git a/tests/cpp/common/test_gpu_hist_util.cu b/tests/cpp/common/test_gpu_hist_util.cu
index 7c4fbd7..350ec42 100644
--- a/tests/cpp/common/test_gpu_hist_util.cu
+++ b/tests/cpp/common/test_gpu_hist_util.cu
@@ -39,7 +39,16 @@ void TestDeviceSketch(const GPUSet& devices) {
// find the cuts on the GPU
const SparsePage& batch = *(*dmat)->GetRowBatches().begin();
HistCutMatrix hmat_gpu;
- DeviceSketch(batch, (*dmat)->Info(), p, &hmat_gpu, gpu_batch_nrows);
+ common::SketchContainer sketch_container;
+ sketch_container.sketches_.resize((*dmat)->Info().num_col_);
+ sketch_container.col_locks_.resize((*dmat)->Info().num_col_);
+ for (int icol = 0; icol < (*dmat)->Info().num_col_; ++icol) {
+ sketch_container.sketches_[icol].Init((*dmat)->Info().num_row_, 1.0 / (8 * p.max_bin));
+ sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
+ }
+
+ DeviceSketch(batch, (*dmat)->Info(), p, sketch_container, gpu_batch_nrows);
+ hmat_gpu.Init(&sketch_container.sketches_, p.max_bin);
// compare the cuts
double eps = 1e-2;
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 37cce8b..5804786 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -77,7 +77,13 @@ void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,
auto is_dense = (*dmat)->Info().num_nonzero_ ==
(*dmat)->Info().num_row_ * (*dmat)->Info().num_col_;
- shard->InitCompressedData(cmat, batch, is_dense);
+ size_t row_stride = 0;
+ const auto &offset_vec = batch.offset.HostVector();
+ for (size_t i = 1; i < offset_vec.size(); ++i) {
+ row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
+ }
+ shard->InitCompressedData(cmat, row_stride, is_dense);
+ shard->CreateHistIndices(batch, 0, std::vector<size_t>(1, batch.Size()), cmat.row_ptr.back());
delete dmat;
}
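Both the updater and this test derive row_stride, the ELLPack row width passed to InitCompressedData, by scanning the SparsePage offsets for the widest row. A minimal standalone version of that scan is below; RowStride is an illustrative name, not part of the patch.

// Widest row in a CSR-style offset array; this is the row_stride the patch
// feeds to InitCompressedData. RowStride is an illustrative name.
#include <algorithm>
#include <cstddef>
#include <vector>

std::size_t RowStride(const std::vector<std::size_t> &offsets) {
  std::size_t stride = 0;
  for (std::size_t i = 1; i < offsets.size(); ++i) {
    stride = std::max(stride, offsets[i] - offsets[i - 1]);
  }
  return stride;
}

// Example: offsets {0, 3, 3, 7, 9} describe rows with 3, 0, 4, and 2 entries,
// so the stride (and ELLPack row width) is 4.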