@sriramch
Created May 3, 2019 15:17
Training patch with perf improvements: GPU quantile sketching across multiple SparsePage batches via a shared SketchContainer with per-column locks, plus per-batch histogram index (ELLPack) construction.
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index b4cc797..1981264 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -96,9 +96,8 @@ struct GPUSketcher {
bool has_weights_{false};
tree::TrainParam param_;
- std::vector<WXQSketch> sketches_;
+ SketchContainer &sketch_container_;
thrust::device_vector<size_t> row_ptrs_;
- std::vector<WXQSketch::SummaryContainer> summaries_;
thrust::device_vector<Entry> entries_;
thrust::device_vector<bst_float> fvalues_;
thrust::device_vector<bst_float> feature_weights_;
@@ -113,9 +112,9 @@ struct GPUSketcher {
public:
DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
- tree::TrainParam param) :
+ tree::TrainParam param, SketchContainer &sketch_container) :
device_(device), row_begin_(row_begin), row_end_(row_end),
- n_rows_(row_end - row_begin), param_(std::move(param)) {
+ n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
}
void Init(const SparsePage& row_batch, const MetaInfo& info, int gpu_batch_nrows) {
@@ -136,20 +135,10 @@ struct GPUSketcher {
gpu_batch_nrows_ = n_rows_;
}
- // initialize sketches
- sketches_.resize(num_cols_);
- summaries_.resize(num_cols_);
constexpr int kFactor = 8;
- double eps = 1.0 / (kFactor * param_.max_bin);
+ double eps = 1.0 / (kFactor * param_.max_bin);
size_t dummy_nlevel;
- WXQSketch::LimitSizeLevel(row_batch.Size(), eps, &dummy_nlevel, &n_cuts_);
- // double ncuts to be the same as the number of values
- // in the temporary buffers of the sketches
- n_cuts_ *= 2;
- for (int icol = 0; icol < num_cols_; ++icol) {
- sketches_[icol].Init(row_batch.Size(), eps);
- summaries_[icol].Reserve(n_cuts_);
- }
+ WXQSketch::LimitSizeLevel(gpu_batch_nrows_, eps, &dummy_nlevel, &n_cuts_);
// allocate necessary GPU buffers
dh::safe_cuda(cudaSetDevice(device_));
@@ -306,9 +295,12 @@ struct GPUSketcher {
// unpack the features; also unpack weights if present
thrust::fill(fvalues_.begin(), fvalues_.end(), NAN);
- thrust::fill(feature_weights_.begin(), feature_weights_.end(), NAN);
+ if (has_weights_) {
+ thrust::fill(feature_weights_.begin(), feature_weights_.end(), NAN);
+ }
- dim3 block3(64, 4, 1);
+ dim3 block3(16, 64, 1);
+ // NOTE: This will typically support ~4M features (64K grid blocks * 64 threads)
dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x),
dh::DivRoundUp(num_cols_, block3.y), 1);
UnpackFeaturesK<<<grid3, block3>>>
@@ -324,9 +316,15 @@ struct GPUSketcher {
// add cuts into sketches
thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
+#pragma omp parallel for schedule(static)
for (int icol = 0; icol < num_cols_; ++icol) {
- summaries_[icol].MakeFromSorted(&cuts_h_[n_cuts_ * icol], n_cuts_cur_[icol]);
- sketches_[icol].PushSummary(summaries_[icol]);
+ WXQSketch::SummaryContainer summary;
+ summary.Reserve(n_cuts_);
+ summary.MakeFromSorted(&cuts_h_[n_cuts_ * icol], n_cuts_cur_[icol]);
+ // Spin in user land until we can grab this lock
+ while (sketch_container_.col_locks_[icol]->test_and_set());
+ sketch_container_.sketches_[icol].PushSummary(summary);
+ sketch_container_.col_locks_[icol]->clear();
}
}
@@ -342,14 +340,10 @@ struct GPUSketcher {
SketchBatch(row_batch, info, gpu_batch);
}
}
-
- void GetSummary(WXQSketch::SummaryContainer *summary, size_t const icol) {
- sketches_[icol].GetSummary(summary);
- }
};
void Sketch(const SparsePage& batch, const MetaInfo& info,
- HistCutMatrix* hmat, int gpu_batch_nrows) {
+ SketchContainer &sketch_container, int gpu_batch_nrows) {
// create device shards
shards_.resize(dist_.Devices().Size());
dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
@@ -357,7 +351,7 @@ struct GPUSketcher {
size_t size = dist_.ShardSize(info.num_row_, i);
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(dist_.Devices().DeviceId(i),
- start, start + size, param_));
+ start, start + size, param_, sketch_container));
});
// compute sketches for each shard
@@ -366,21 +360,6 @@ struct GPUSketcher {
shard->Init(batch, info, gpu_batch_nrows);
shard->Sketch(batch, info);
});
-
- // merge the sketches from all shards
- // TODO(canonizer): do it in a tree-like reduction
- int num_cols = info.num_col_;
- std::vector<WXQSketch> sketches(num_cols);
- WXQSketch::SummaryContainer summary;
- for (int icol = 0; icol < num_cols; ++icol) {
- sketches[icol].Init(batch.Size(), 1.0 / (8 * param_.max_bin));
- for (auto &shard : shards_) {
- shard->GetSummary(&summary, icol);
- sketches[icol].PushSummary(summary);
- }
- }
-
- hmat->Init(&sketches, param_.max_bin);
}
GPUSketcher(tree::TrainParam param, size_t n_rows) : param_(std::move(param)) {
@@ -395,9 +374,10 @@ struct GPUSketcher {
void DeviceSketch
(const SparsePage& batch, const MetaInfo& info,
- const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows) {
+ const tree::TrainParam& param, SketchContainer &sketch_container,
+ int gpu_batch_nrows) {
GPUSketcher sketcher(param, info.num_row_);
- sketcher.Sketch(batch, info, hmat, gpu_batch_nrows);
+ sketcher.Sketch(batch, info, sketch_container, gpu_batch_nrows);
}
} // namespace common
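
The per-column push loop above protects each shared sketch with a user-land spin lock built from `std::atomic_flag`, so several device shards can push summaries concurrently. A minimal, self-contained sketch of that lock pattern follows; the class name is illustrative and not part of the patch, and the patch itself uses the default sequentially consistent ordering, while acquire/release is shown here as the usual minimum.

```cpp
#include <atomic>

// Minimal spin lock mirroring the test_and_set()/clear() pattern above.
// test_and_set() returns the flag's previous value, so the loop exits only
// once this thread is the one that set it, i.e. acquired the lock.
class SpinLock {
 public:
  void lock() {
    while (flag_.test_and_set(std::memory_order_acquire)) {
      // busy-wait in user space; the guarded PushSummary() call is short
    }
  }
  void unlock() { flag_.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag_ = ATOMIC_FLAG_INIT;
};
```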
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 3aa2f23..2d0d10a 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -47,10 +47,24 @@ struct HistCutMatrix {
Monitor monitor_;
};
+
+/*!
+ * \brief A container that holds the device sketches across all
+ * sparse page batches which are distributed to different devices.
+ * As sketches are aggregated by column, the atomic flag simulates
+ * a lock in user land when multiple devices could be pushing
+ * sketch summaries for the same column across distinct rows.
+ */
+struct SketchContainer {
+ std::vector<HistCutMatrix::WXQSketch> sketches_;
+ std::vector<std::unique_ptr<std::atomic_flag>> col_locks_;
+};
+
/*! \brief Builds the cut matrix on the GPU */
void DeviceSketch
(const SparsePage& batch, const MetaInfo& info,
- const tree::TrainParam& param, HistCutMatrix* hmat, int gpu_batch_nrows);
+ const tree::TrainParam& param, SketchContainer &sketches,
+ int gpu_batch_nrows);
/*!
* \brief A single row in global histogram index.
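
For reference, the intended call sequence for the new interface is: size and initialize a SketchContainer once, run DeviceSketch over every SparsePage batch (each call pushes per-column summaries into the shared container), and only then build the cut matrix. The sketch below is assembled from the updater and test changes later in this patch; the function name is illustrative, the surrounding xgboost headers and namespaces are assumed, and the per-batch num_row_ override (RowReset in the updater) is omitted for brevity.

```cpp
// Assumes xgboost's DMatrix, MetaInfo, tree::TrainParam, and the
// common::SketchContainer / DeviceSketch / HistCutMatrix declared above.
void BuildCutsAcrossBatches(DMatrix *dmat, const tree::TrainParam &param,
                            int gpu_batch_nrows, common::HistCutMatrix *hmat) {
  const MetaInfo &info = dmat->Info();
  common::SketchContainer sketch_container;
  sketch_container.sketches_.resize(info.num_col_);
  sketch_container.col_locks_.resize(info.num_col_);
  for (size_t icol = 0; icol < info.num_col_; ++icol) {
    // 1/(8 * max_bin) matches the summary accuracy used by the updater
    sketch_container.sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
    sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
  }
  for (const auto &batch : dmat->GetRowBatches()) {
    // each call merges this batch's per-column summaries into the container
    common::DeviceSketch(batch, info, param, sketch_container, gpu_batch_nrows);
  }
  // build the cut matrix once, over the sketches accumulated from all batches
  hmat->Init(&sketch_container.sketches_, param.max_bin);
}
```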
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index e93ca9a..573d17d 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -164,8 +164,10 @@ struct ELLPackMatrix {
}
void Init(common::Span<uint32_t> feature_segments,
common::Span<bst_float> min_fvalue,
- common::Span<bst_float> gidx_fvalue_map, size_t row_stride,
- common::CompressedIterator<uint32_t> gidx_iter, bool is_dense,
+ common::Span<bst_float> gidx_fvalue_map,
+ size_t row_stride,
+ common::CompressedIterator<uint32_t> gidx_iter,
+ bool is_dense,
int null_gidx_value) {
this->feature_segments = feature_segments;
this->min_fvalue = min_fvalue;
@@ -646,8 +648,6 @@ struct DeviceShard {
/*! \brief Sum gradient for each node. */
std::vector<GradientPair> node_sum_gradients;
common::Span<GradientPair> node_sum_gradients_d;
- /*! \brief row offset in SparsePage (the input data). */
- thrust::device_vector<size_t> row_ptrs;
/*! \brief On-device feature set, only actually used on one of the devices */
thrust::device_vector<int> feature_set_d;
thrust::device_vector<int64_t>
@@ -656,6 +656,7 @@ struct DeviceShard {
bst_uint row_begin_idx;
bst_uint row_end_idx;
bst_uint n_rows;
+ bst_uint rows_processed;
TrainParam param;
bool prediction_cache_initialised;
@@ -674,36 +675,17 @@ struct DeviceShard {
row_begin_idx(row_begin),
row_end_idx(row_end),
n_rows(row_end - row_begin),
+ rows_processed(0),
n_bins(0),
param(std::move(_param)),
prediction_cache_initialised(false) {}
- /* Init row_ptrs and row_stride */
- size_t InitRowPtrs(const SparsePage& row_batch) {
- const auto& offset_vec = row_batch.offset.HostVector();
- row_ptrs.resize(n_rows + 1);
- thrust::copy(offset_vec.data() + row_begin_idx,
- offset_vec.data() + row_end_idx + 1,
- row_ptrs.begin());
- auto row_iter = row_ptrs.begin();
- // find the maximum row size for converting to ELLPack
- auto get_size = [=] __device__(size_t row) {
- return row_iter[row + 1] - row_iter[row];
- }; // NOLINT
-
- auto counting = thrust::make_counting_iterator(size_t(0));
- using TransformT = thrust::transform_iterator<decltype(get_size),
- decltype(counting), size_t>;
- TransformT row_size_iter = TransformT(counting, get_size);
- size_t row_stride = thrust::reduce(row_size_iter, row_size_iter + n_rows, 0,
- thrust::maximum<size_t>());
- return row_stride;
- }
-
void InitCompressedData(
- const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense);
+ const common::HistCutMatrix& hmat, size_t row_stride, bool is_dense);
- void CreateHistIndices(const SparsePage& row_batch, size_t row_stride, int null_gidx_value);
+ void CreateHistIndices(
+ const SparsePage& row_batch, size_t shard_idx,
+ const std::vector<size_t> &shard_allocations, int null_gidx_value);
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_id));
@@ -1045,11 +1027,13 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
template <typename GradientSumT>
inline void DeviceShard<GradientSumT>::InitCompressedData(
- const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense) {
- size_t row_stride = this->InitRowPtrs(row_batch);
+ const common::HistCutMatrix& hmat, size_t row_stride, bool is_dense) {
n_bins = hmat.row_ptr.back();
int null_gidx_value = hmat.row_ptr.back();
+ CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
+ << "Max leaves and max depth cannot both be unconstrained for "
+ "gpu_hist.";
int max_nodes =
param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth);
@@ -1072,24 +1056,17 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
node_sum_gradients.resize(max_nodes);
ridx_segments.resize(max_nodes);
-
// allocate compressed bin data
int num_symbols = n_bins + 1;
// Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
num_symbols);
-
- CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
- << "Max leaves and max depth cannot both be unconstrained for "
- "gpu_hist.";
ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
thrust::fill(
thrust::device_pointer_cast(gidx_buffer.data()),
thrust::device_pointer_cast(gidx_buffer.data() + gidx_buffer.size()), 0);
- this->CreateHistIndices(row_batch, row_stride, null_gidx_value);
-
ellpack_matrix.Init(
feature_segments, min_fvalue,
gidx_fvalue_map, row_stride,
@@ -1113,23 +1090,42 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
template <typename GradientSumT>
inline void DeviceShard<GradientSumT>::CreateHistIndices(
- const SparsePage& row_batch, size_t row_stride, int null_gidx_value) {
+ const SparsePage& row_batch,
+ size_t shard_idx,
+ const std::vector<size_t> &shard_allocations,
+ int null_gidx_value) {
+ // Has anything been allocated to me in this batch?
+ size_t num_elems = shard_allocations[shard_idx];
+ if (!num_elems) return;
+
+ size_t row_stride = this->ellpack_matrix.row_stride;
+
+ // Take into account the rows that could be processed by other GPUs
+ size_t begin_idx(0);
+ for (size_t i = 0; i < shard_idx; ++i) begin_idx += shard_allocations[i];
+
+ const auto& offset_vec = row_batch.offset.HostVector();
+ /*! \brief row offset in SparsePage (the input data). */
+ thrust::device_vector<size_t> row_ptrs(num_elems+1);
+ thrust::copy(offset_vec.data() + begin_idx,
+ offset_vec.data() + begin_idx + num_elems + 1,
+ row_ptrs.begin());
+
int num_symbols = n_bins + 1;
// bin and compress entries in batches of rows
size_t gpu_batch_nrows =
std::min
(dh::TotalMemory(device_id) / (16 * row_stride * sizeof(Entry)),
- static_cast<size_t>(n_rows));
+ static_cast<size_t>(num_elems));
const std::vector<Entry>& data_vec = row_batch.data.HostVector();
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
- size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
-
+ size_t gpu_nbatches = dh::DivRoundUp(num_elems, gpu_batch_nrows);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
- if (batch_row_end > n_rows) {
- batch_row_end = n_rows;
+ if (batch_row_end > num_elems) {
+ batch_row_end = num_elems;
}
size_t batch_nrows = batch_row_end - batch_row_begin;
// number of entries in this batch.
@@ -1140,19 +1136,25 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
(entries_d.data().get(), data_vec.data() + row_ptrs[batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
const dim3 block3(32, 8, 1); // 256 threads
- const dim3 grid3(dh::DivRoundUp(n_rows, block3.x),
+ const dim3 grid3(dh::DivRoundUp(num_elems, block3.x),
dh::DivRoundUp(row_stride, block3.y), 1);
CompressBinEllpackKernel<<<grid3, block3>>>
(common::CompressedBufferWriter(num_symbols),
gidx_buffer.data(),
row_ptrs.data().get() + batch_row_begin,
entries_d.data().get(),
- gidx_fvalue_map.data(), feature_segments.data(),
- batch_row_begin, batch_nrows,
+ gidx_fvalue_map.data(),
+ feature_segments.data(),
+ rows_processed + batch_row_begin,
+ batch_nrows,
row_ptrs[batch_row_begin],
- row_stride, null_gidx_value);
+ row_stride,
+ null_gidx_value);
}
+ // This will be the offset into the gidx_buffer for the next batch
+ rows_processed += num_elems;
+
// free the memory that is no longer needed
row_ptrs.resize(0);
row_ptrs.shrink_to_fit();
@@ -1216,8 +1218,6 @@ class GPUHistMakerSpecialised{
reducer_.Init(device_list_);
- auto batch_iter = dmat->GetRowBatches().begin();
- const SparsePage& batch = *batch_iter;
// Create device shards
shards_.resize(n_devices);
dh::ExecuteIndexShards(
@@ -1231,23 +1231,99 @@ class GPUHistMakerSpecialised{
start + size, param_));
});
- // Find the cuts.
+ // Initialize sketches shared across all batches
+ common::SketchContainer sketch_container;
+ sketch_container.sketches_.resize(info_->num_col_);
+ sketch_container.col_locks_.resize(info_->num_col_);
+#pragma omp parallel for schedule(static)
+ for (int icol = 0; icol < info_->num_col_; ++icol) {
+ sketch_container.sketches_[icol].Init(info_->num_row_, 1.0 / (8 * param_.max_bin));
+ sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
+ }
+
+ size_t row_stride(0); // Max row_stride across the entire dataset
+ size_t processed_size(0);
+
+ // Rows allocated to each GPU in a batch
+ std::vector<std::vector<size_t>> shard_allocations;
monitor_.StartCuda("Quantiles");
- common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows);
- n_bins_ = hmat_.row_ptr.back();
+ for (const auto &batch : dmat->GetRowBatches()) {
+ struct RowReset {
+ public:
+ RowReset(MetaInfo &minfo, const SparsePage &r_batch)
+ : minfo_(minfo) {
+ orig_nrows_ = minfo_.num_row_;
+ minfo.num_row_ = r_batch.Size();
+ }
+
+ ~RowReset() {
+ minfo_.num_row_ = orig_nrows_;
+ }
+ private:
+ MetaInfo &minfo_;
+ size_t orig_nrows_;
+ } rowgd(*info_, batch);
+
+ // Find the cuts for each batch
+ common::DeviceSketch(batch, *info_, param_, sketch_container, hist_maker_param_.gpu_batch_nrows);
+
+ const auto &offset_vec = batch.offset.HostVector();
+ for (size_t i = 1; i < offset_vec.size(); ++i) {
+ row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
+ }
+
+ std::vector<size_t> batch_allocation(shards_.size());
+ // Distribute the rows in this batch to the different shards
+ for (size_t i = 0; i < shards_.size(); ++i) {
+ // Does this batch pertain to me?
+ ssize_t num_elems = processed_size + batch.Size() - shards_[i]->row_begin_idx;
+ ssize_t rem_elems = shards_[i]->row_end_idx - processed_size;
+ if (num_elems <= 0 || rem_elems <= 0) batch_allocation[i] = 0;
+ // How many elements do I process from this batch?
+ else {
+ num_elems = std::min(
+ std::min(
+ std::min(num_elems, rem_elems),
+ static_cast<ssize_t>(shards_[i]->n_rows)),
+ static_cast<ssize_t>(batch.Size()));
+ batch_allocation[i] = num_elems;
+ }
+ }
+ shard_allocations.push_back(batch_allocation);
+
+ processed_size += batch.Size();
+ }
monitor_.StopCuda("Quantiles");
+
+ // Initialize hmat_ for the entire data set
+ hmat_.Init(&sketch_container.sketches_, param_.max_bin);
+
+ n_bins_ = hmat_.row_ptr.back();
+
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
- monitor_.StartCuda("BinningCompression");
+ // Init global data for each shard
+ monitor_.StartCuda("InitCompressedData");
dh::ExecuteIndexShards(
&shards_,
- [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
- dh::safe_cuda(cudaSetDevice(shard->device_id));
- shard->InitCompressedData(hmat_, batch, is_dense);
+ [&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+ dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(i)));
+ shard->InitCompressedData(hmat_, row_stride, is_dense);
});
+ monitor_.StopCuda("InitCompressedData");
+
+ monitor_.StartCuda("BinningCompression");
+ size_t batch_idx(0);
+ for (const auto &batch : dmat->GetRowBatches()) {
+ dh::ExecuteIndexShards(
+ &shards_,
+ [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+ dh::safe_cuda(cudaSetDevice(shard->device_id));
+ shard->CreateHistIndices(batch, idx, shard_allocations[batch_idx], hmat_.row_ptr.back());
+ });
+ batch_idx++;
+ }
monitor_.StopCuda("BinningCompression");
- ++batch_iter;
- CHECK(batch_iter.AtEnd()) << "External memory not supported";
p_last_fmat_ = dmat;
initialised_ = true;
diff --git a/tests/cpp/common/test_gpu_hist_util.cu b/tests/cpp/common/test_gpu_hist_util.cu
index 7c4fbd7..350ec42 100644
--- a/tests/cpp/common/test_gpu_hist_util.cu
+++ b/tests/cpp/common/test_gpu_hist_util.cu
@@ -39,7 +39,16 @@ void TestDeviceSketch(const GPUSet& devices) {
// find the cuts on the GPU
const SparsePage& batch = *(*dmat)->GetRowBatches().begin();
HistCutMatrix hmat_gpu;
- DeviceSketch(batch, (*dmat)->Info(), p, &hmat_gpu, gpu_batch_nrows);
+ common::SketchContainer sketch_container;
+ sketch_container.sketches_.resize((*dmat)->Info().num_col_);
+ sketch_container.col_locks_.resize((*dmat)->Info().num_col_);
+ for (int icol = 0; icol < (*dmat)->Info().num_col_; ++icol) {
+ sketch_container.sketches_[icol].Init((*dmat)->Info().num_row_, 1.0 / (8 * p.max_bin));
+ sketch_container.col_locks_[icol].reset(new std::atomic_flag(ATOMIC_FLAG_INIT));
+ }
+
+ DeviceSketch(batch, (*dmat)->Info(), p, sketch_container, gpu_batch_nrows);
+ hmat_gpu.Init(&sketch_container.sketches_, p.max_bin);
// compare the cuts
double eps = 1e-2;
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 37cce8b..5804786 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -77,7 +77,13 @@ void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,
auto is_dense = (*dmat)->Info().num_nonzero_ ==
(*dmat)->Info().num_row_ * (*dmat)->Info().num_col_;
- shard->InitCompressedData(cmat, batch, is_dense);
+ size_t row_stride = 0;
+ const auto &offset_vec = batch.offset.HostVector();
+ for (size_t i = 1; i < offset_vec.size(); ++i) {
+ row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
+ }
+ shard->InitCompressedData(cmat, row_stride, is_dense);
+ shard->CreateHistIndices(batch, 0, std::vector<size_t>(1, batch.Size()), cmat.row_ptr.back());
delete dmat;
}
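
Both the updater and the test above derive the ELLPack row stride as the longest row in the batch's CSR offsets. A small hypothetical helper capturing that computation (not part of the patch; `offsets` stands in for `batch.offset.HostVector()`):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Longest row length in a CSR offset array, i.e. the ELLPack row stride.
size_t MaxRowStride(const std::vector<size_t> &offsets) {
  size_t row_stride = 0;
  for (size_t i = 1; i < offsets.size(); ++i) {
    row_stride = std::max(row_stride, offsets[i] - offsets[i - 1]);
  }
  return row_stride;
}
```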