mgpu external memory predictor patch
diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h
index 425cbff..4cb1677 100644
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -132,7 +132,6 @@ class GPUDistribution {
     if (size == 0) { return 0; }
     if (offsets_.size() > 0) {
       // explicit offsets are provided
-      CHECK_EQ(offsets_.back(), size);
       return offsets_.at(index);
     }
     // no explicit offsets
@@ -145,7 +144,7 @@ class GPUDistribution {
     if (size == 0) { return 0; }
     if (offsets_.size() > 0) {
       // explicit offsets are provided
-      CHECK_EQ(offsets_.back(), size);
+      if (!offsets_.at(index+1)) return 0;
       return offsets_.at(index + 1) - offsets_.at(index) +
              (index == devices_.Size() - 1 ? overlap_ : 0);
     }
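Why this change: with explicit offsets, shard i owns the half-open range [offsets_[i], offsets_[i+1]) of the underlying data. The removed CHECK_EQ asserted that the last offset equals the full data size, which no longer holds under external memory, where only the rows of the current batch are resident on the devices. The new guard treats a zero end offset as a shard with no resident data. Below is a minimal standalone sketch of the patched ShardSize logic, with simplified types (a plain std::vector in place of the GPUDistribution members, and hypothetical parameter names):

#include <cstddef>
#include <vector>

// offsets holds cumulative boundaries: shard `index` owns
// [offsets[index], offsets[index + 1]). After the patch, a zero end
// offset marks a shard with no resident data for the current batch.
std::size_t ShardSize(const std::vector<std::size_t> &offsets,
                      std::size_t index, std::size_t overlap,
                      std::size_t n_devices) {
  if (offsets.at(index + 1) == 0) return 0;  // empty shard
  return offsets.at(index + 1) - offsets.at(index) +
         (index == n_devices - 1 ? overlap : 0);
}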
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 39b5c87..639b977 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -178,7 +178,6 @@ __device__ float GetLeafWeight(bst_uint ridx, const DevicePredictionNode* tree,
   return n.GetWeight();
 }
-template <int BLOCK_THREADS>
 __global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
                               common::Span<float> d_out_predictions,
                               common::Span<size_t> d_tree_segments,
@@ -231,9 +230,16 @@ class GPUPredictor : public xgboost::Predictor {
       auto data_span = data.DeviceSpan(device);
       dh::safe_cuda(cudaSetDevice(device));
       // copy the last element from every shard
-      dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
-                               &data_span[data_span.size()-1],
-                               sizeof(size_t), cudaMemcpyDeviceToHost));
+      if (!data_span.size()) {
+        size_t zero_size = 0;
+        dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
+                                 &zero_size,
+                                 sizeof(size_t), cudaMemcpyHostToHost));
+      } else {
+        dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
+                                 &data_span[data_span.size()-1],
+                                 sizeof(size_t), cudaMemcpyDeviceToHost));
+      }
     }
   }
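DeviceOffsets builds the explicit per-device data offsets by reading the last entry of each shard's offset span (the CSR row pointer) back to the host. Under external memory a batch may leave a shard's span empty, in which case data_span[data_span.size()-1] would index with a wrapped-around size_t; the patch writes 0 for that shard instead. A simplified sketch of the per-shard gather (a hypothetical helper; the real code iterates HostDeviceVector shards and wraps the copy in dh::safe_cuda):

#include <cuda_runtime.h>
#include <cstddef>

// Writes the last element of a device array into offsets[shard + 1],
// or 0 when the span is empty (avoiding the size - 1 underflow).
void GatherShardEndOffset(const std::size_t *d_span, std::size_t span_size,
                          std::size_t *offsets, int shard) {
  if (span_size == 0) {
    offsets[shard + 1] = 0;  // empty shard: nothing to read from the device
  } else {
    cudaMemcpy(&offsets[shard + 1], d_span + (span_size - 1),
               sizeof(std::size_t), cudaMemcpyDeviceToHost);
  }
}

A plain host assignment, as sketched here, is equivalent to the patch's cudaMemcpyHostToHost copy of a zero-valued temporary.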
@@ -267,7 +273,7 @@ class GPUPredictor : public xgboost::Predictor {
                                cudaMemcpyHostToDevice));
     const int BLOCK_THREADS = 128;
-    size_t num_rows = batch.offset.DeviceSize(device_) - 1;
+    ssize_t num_rows = batch.offset.DeviceSize(device_) - 1;
     if (num_rows < 1) { return; }
     const int GRID_SIZE = static_cast<int>(dh::DivRoundUp(num_rows, BLOCK_THREADS));
@@ -283,8 +289,10 @@ class GPUPredictor : public xgboost::Predictor {
     size_t entry_start = data_distr.ShardStart(batch.data.Size(),
                                                data_distr.Devices().Index(device_));
-    PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
-        (dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_),
+    PredictKernel<<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
+        (dh::ToSpan(nodes_),
+         predictions->DeviceSpan(device_),
+         dh::ToSpan(tree_segments_),
          dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_),
          batch.data.DeviceSpan(device_), tree_begin, tree_end, info.num_col_,
          num_rows, entry_start, use_shared, model.param.num_output_group);
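Two things change in the hunks above. First, num_rows becomes signed: batch.offset.DeviceSize(device_) can now be 0 for a device that holds none of the current batch, and with an unsigned type 0 - 1 wraps around instead of going negative, so the num_rows < 1 early return would never fire. A minimal sketch of the wrap-around this guards against:

#include <cstdio>
#include <cstddef>
#include <sys/types.h>  // ssize_t (POSIX)

int main() {
  std::size_t device_size = 0;           // empty shard under external memory
  std::size_t u_rows = device_size - 1;  // wraps to SIZE_MAX; `< 1` never fires
  ssize_t s_rows = static_cast<ssize_t>(device_size) - 1;  // -1; caught by `< 1`
  std::printf("unsigned: %zu signed: %zd\n", u_rows, s_rows);
  return 0;
}

Second, PredictKernel drops its BLOCK_THREADS template parameter: the block size is still supplied in the launch configuration, it is simply no longer baked into the kernel's type.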
@@ -323,20 +331,59 @@ class GPUPredictor : public xgboost::Predictor {
                    h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
     }
-    size_t i_batch = 0;
+    // Accumulate the predictions from all batches into this
+    std::vector<float> &hvec = out_preds->HostVector();
+    std::vector<float>::iterator hitr = hvec.begin();
     for (const auto &batch : dmat->GetRowBatches()) {
-      CHECK_EQ(i_batch, 0) << "External memory not supported";
       // out_preds have been resharded and resized in InitOutPredictions()
       batch.offset.Reshard(GPUDistribution::Overlap(devices_, 1));
+
       std::vector<size_t> device_offsets;
       DeviceOffsets(batch.offset, &device_offsets);
+
       batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets));
+
+      // Gather the batch predictions; but first, reset the number of rows
+      // to what is present in the batch. The local batch prediction will
+      // then be resharded based on the number of devices present
+      struct RowReset {
+       public:
+        RowReset(MetaInfo &minfo, const SparsePage &r_batch)
+            : minfo_(minfo) {
+          orig_nrows_ = minfo_.num_row_;
+          minfo.num_row_ = r_batch.Size();
+        }
+
+        ~RowReset() {
+          minfo_.num_row_ = orig_nrows_;
+        }
+       private:
+        MetaInfo &minfo_;
+        size_t orig_nrows_;
+      } rowgd(dmat->Info(), batch);
+
+      HostDeviceVector<bst_float> batchPreds;
+      this->InitOutPredictions(dmat->Info(), &batchPreds, model);
+      // Starting from hitr, copy the batch.Size() * model.param.num_output_group
+      // predictions that were previously present into batchPreds
+      std::copy(hitr, hitr + (batch.Size() * model.param.num_output_group),
+                batchPreds.HostVector().begin());
+
       dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-        shard.PredictInternal(batch, dmat->Info(), out_preds, model,
+        shard.PredictInternal(batch, dmat->Info(), &batchPreds, model,
                               h_tree_segments, h_nodes, tree_begin, tree_end);
       });
-      i_batch++;
+
+      // Copy the batch predictions back into the global prediction array
+      std::copy(batchPreds.ConstHostVector().begin(),
+                batchPreds.ConstHostVector().end(), hitr);
+      std::advance(hitr, batchPreds.Size());
+
+      // Sparse pages can be reused by the threaded iterator when there are
+      // a number of them. Thus, for Reshard to work on future pages, we
+      // empty out their current distribution, as Reshard expects at least
+      // one of the source/target distributions to be empty
+      batch.offset.Reshard(GPUDistribution());
+      batch.data.Reshard(GPUDistribution());
     }
     monitor_.StopCuda("DevicePredictInternal");
   }
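The core of the external-memory support is the batch loop above: instead of rejecting a second batch (the old CHECK_EQ(i_batch, 0)), it predicts each SparsePage into a per-batch buffer sized via a temporarily overridden row count, then copies the result into the right slice of the global output through hitr. The RowReset struct is a scope guard: it shrinks MetaInfo::num_row_ to the batch's size so InitOutPredictions allocates a batch-sized buffer, and its destructor restores the original count on every exit path. A generic sketch of that idiom (a hypothetical helper, not part of xgboost):

#include <utility>

// Scope guard that overrides a value and restores the original on exit,
// mirroring what RowReset does for MetaInfo::num_row_ above.
template <typename T>
class ScopedOverride {
 public:
  ScopedOverride(T &target, T temporary)
      : target_(target), original_(target) {
    target_ = std::move(temporary);
  }
  ~ScopedOverride() { target_ = original_; }  // runs on every exit path

 private:
  T &target_;
  T original_;
};

// Usage mirroring the patch (hypothetical names):
//   ScopedOverride<size_t> guard(info.num_row_, batch.Size());
//   ... initialize and run the per-batch prediction ...
// num_row_ is restored when guard goes out of scope.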