@sriramch
Created April 17, 2019 15:52
mgpu external memory predictor patch
diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h
index 425cbff..4cb1677 100644
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -132,7 +132,6 @@ class GPUDistribution {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
- CHECK_EQ(offsets_.back(), size);
return offsets_.at(index);
}
// no explicit offsets
@@ -145,7 +144,7 @@ class GPUDistribution {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
- CHECK_EQ(offsets_.back(), size);
+ if (!offsets_.at(index + 1)) return 0;  // unfilled/empty shard: size is 0
return offsets_.at(index + 1) - offsets_.at(index) +
(index == devices_.Size() - 1 ? overlap_ : 0);
}
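
Both hunks above drop the CHECK_EQ(offsets_.back(), size) assertion for explicitly provided offsets; the second additionally treats a zero entry at index + 1 as an empty shard, so a page whose per-device offsets have not been populated yields a size of 0 instead of aborting. A minimal standalone sketch of that shard-size logic, using an illustrative free function (ExplicitShardSize is not part of XGBoost):

#include <cstddef>
#include <vector>

// Illustrative only: per-device shard size from an explicit offsets vector of
// n_devices + 1 entries. A zero entry at index + 1 marks a shard whose offsets
// have not been filled in yet (e.g. an empty external-memory page), so the
// size is 0 instead of tripping a check against the total size.
std::size_t ExplicitShardSize(const std::vector<std::size_t>& offsets,
                              std::size_t index,
                              std::size_t overlap,
                              std::size_t n_devices) {
  if (offsets.at(index + 1) == 0) { return 0; }
  return offsets.at(index + 1) - offsets.at(index) +
         (index == n_devices - 1 ? overlap : 0);
}
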
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 39b5c87..639b977 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -178,7 +178,6 @@ __device__ float GetLeafWeight(bst_uint ridx, const DevicePredictionNode* tree,
return n.GetWeight();
}
-template <int BLOCK_THREADS>
__global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
common::Span<float> d_out_predictions,
common::Span<size_t> d_tree_segments,
@@ -231,9 +230,16 @@ class GPUPredictor : public xgboost::Predictor {
auto data_span = data.DeviceSpan(device);
dh::safe_cuda(cudaSetDevice(device));
// copy the last element from every shard
- dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
- &data_span[data_span.size()-1],
- sizeof(size_t), cudaMemcpyDeviceToHost));
+ if (!data_span.size()) {
+ size_t zero_size = 0;
+ dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
+ &zero_size,
+ sizeof(size_t), cudaMemcpyHostToHost));
+ } else {
+ dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
+ &data_span[data_span.size()-1],
+ sizeof(size_t), cudaMemcpyDeviceToHost));
+ }
}
}
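
The hunk above guards the "copy the last element from every shard" step against an empty device span, which can occur when an external-memory page leaves a device with no rows. A minimal sketch of the same guard, with hypothetical names (GatherShardOffsets, d_span_ptrs) and without XGBoost's cudaSetDevice / dh::safe_cuda wrapping; the patch itself routes even the zero through cudaMemcpy (HostToHost), while a plain host assignment is shown here for brevity:

#include <cuda_runtime.h>
#include <cstddef>
#include <vector>

// Illustrative only: collect the last row-offset from each device shard into
// a host-side cumulative offsets vector, treating an empty shard as 0 rather
// than reading one element past the end of its span.
void GatherShardOffsets(const std::vector<const std::size_t*>& d_span_ptrs,
                        const std::vector<std::size_t>& span_sizes,
                        std::vector<std::size_t>* out_offsets) {
  out_offsets->assign(d_span_ptrs.size() + 1, 0);
  for (std::size_t shard = 0; shard < d_span_ptrs.size(); ++shard) {
    if (span_sizes[shard] == 0) {
      (*out_offsets)[shard + 1] = 0;  // empty shard on this device
    } else {
      cudaMemcpy(&(*out_offsets)[shard + 1],
                 d_span_ptrs[shard] + span_sizes[shard] - 1,
                 sizeof(std::size_t), cudaMemcpyDeviceToHost);
    }
  }
}
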
@@ -267,7 +273,7 @@ class GPUPredictor : public xgboost::Predictor {
cudaMemcpyHostToDevice));
const int BLOCK_THREADS = 128;
- size_t num_rows = batch.offset.DeviceSize(device_) - 1;
+ ssize_t num_rows = batch.offset.DeviceSize(device_) - 1;
if (num_rows < 1) { return; }
const int GRID_SIZE = static_cast<int>(dh::DivRoundUp(num_rows, BLOCK_THREADS));
@@ -283,8 +289,10 @@ class GPUPredictor : public xgboost::Predictor {
size_t entry_start = data_distr.ShardStart(batch.data.Size(),
data_distr.Devices().Index(device_));
- PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
- (dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_),
+ PredictKernel<<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
+ (dh::ToSpan(nodes_),
+ predictions->DeviceSpan(device_),
+ dh::ToSpan(tree_segments_),
dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_),
batch.data.DeviceSpan(device_), tree_begin, tree_end, info.num_col_,
num_rows, entry_start, use_shared, model.param.num_output_group);
@@ -323,20 +331,59 @@ class GPUPredictor : public xgboost::Predictor {
h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
}
- size_t i_batch = 0;
+ // Accumulate the predictions from all batches into this host vector
+ std::vector<float> &hvec = out_preds->HostVector();
+ std::vector<float>::iterator hitr = hvec.begin();
for (const auto &batch : dmat->GetRowBatches()) {
- CHECK_EQ(i_batch, 0) << "External memory not supported";
// out_preds have been resharded and resized in InitOutPredictions()
batch.offset.Reshard(GPUDistribution::Overlap(devices_, 1));
+
std::vector<size_t> device_offsets;
DeviceOffsets(batch.offset, &device_offsets);
+
batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets));
+
+ // Gather the batch predictions, but first reset the number of rows
+ // to what is present in this batch, so that the local batch predictions
+ // are resharded across the devices already present
+ struct RowReset {
+ public:
+ RowReset(MetaInfo &minfo, const SparsePage &r_batch)
+ : minfo_(minfo) {
+ orig_nrows_ = minfo_.num_row_;
+ minfo.num_row_ = r_batch.Size();
+ }
+
+ ~RowReset() {
+ minfo_.num_row_ = orig_nrows_;
+ }
+ private:
+ MetaInfo &minfo_;
+ size_t orig_nrows_;
+ } rowgd(dmat->Info(), batch);
+
+ HostDeviceVector<bst_float> batchPreds;
+ this->InitOutPredictions(dmat->Info(), &batchPreds, model);
+ // Starting from hitr, copy the batch.Size() * model.param.num_output_group
+ // predictions that were previously present into batchPreds
+ std::copy(hitr, hitr + (batch.Size() * model.param.num_output_group), batchPreds.HostVector().begin());
+
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
- shard.PredictInternal(batch, dmat->Info(), out_preds, model,
+ shard.PredictInternal(batch, dmat->Info(), &batchPreds, model,
h_tree_segments, h_nodes, tree_begin, tree_end);
});
- i_batch++;
+
+ // Copy the batch predictions to the global prediction array
+ std::copy(batchPreds.ConstHostVector().begin(), batchPreds.ConstHostVector().end(), hitr);
+ std::advance(hitr, batchPreds.Size());
+
+ // Sparse pages can be reused by the threaded iterator when there are several
+ // of them. Thus, for Reshard to work on future pages, we empty out the
+ // current distribution, as Reshard expects at least one of the source/target
+ // distributions to be empty
+ batch.offset.Reshard(GPUDistribution());
+ batch.data.Reshard(GPUDistribution());
}
monitor_.StopCuda("DevicePredictInternal");
}
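
The external-memory loop above combines three pieces: a scope guard that temporarily overrides MetaInfo::num_row_ with the batch's row count, a per-batch prediction buffer (batchPreds), and a host-side cursor (hitr) that appends each batch's output into the full prediction vector before the batch's offset/data distributions are reset for reuse. A minimal standalone sketch of the guard and cursor pattern, using hypothetical names (ScopedRowCount, AppendBatch) that are not part of XGBoost:

#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative RAII guard mirroring the anonymous RowReset struct above:
// temporarily present the current batch's row count as the dataset's row
// count and restore the original value when the guard leaves scope.
struct ScopedRowCount {
  ScopedRowCount(std::size_t* num_rows, std::size_t batch_rows)
      : num_rows_(num_rows), saved_(*num_rows) {
    *num_rows_ = batch_rows;
  }
  ~ScopedRowCount() { *num_rows_ = saved_; }

 private:
  std::size_t* num_rows_;
  std::size_t saved_;
};

// Illustrative accumulation: copy one batch's predictions into the global
// output at a moving cursor, the same bookkeeping the patch does with hitr.
void AppendBatch(const std::vector<float>& batch_preds,
                 std::vector<float>::iterator* cursor) {
  *cursor = std::copy(batch_preds.begin(), batch_preds.end(), *cursor);
}

Like the RowReset struct in the patch, the guard relies on scope exit rather than an explicit restore call, so the original row count is restored even if prediction exits the loop body early.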