@sriramch
Created April 17, 2019 15:52
mgpu external memory predictor patch
diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h
index 425cbff..4cb1677 100644
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -132,7 +132,6 @@ class GPUDistribution {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
- CHECK_EQ(offsets_.back(), size);
return offsets_.at(index);
}
// no explicit offsets
@@ -145,7 +144,7 @@ class GPUDistribution {
if (size == 0) { return 0; }
if (offsets_.size() > 0) {
// explicit offsets are provided
- CHECK_EQ(offsets_.back(), size);
+ if (!offsets_.at(index + 1)) return 0;  // unfilled/empty shard: size is 0
return offsets_.at(index + 1) - offsets_.at(index) +
(index == devices_.Size() - 1 ? overlap_ : 0);
}
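
Both hunks above drop the CHECK_EQ(offsets_.back(), size) assertion for explicitly provided offsets; the second additionally treats a zero entry at index + 1 as an empty shard, so a page whose per-device offsets have not been populated yields a size of 0 instead of aborting. A minimal standalone sketch of that shard-size logic, using an illustrative free function (ExplicitShardSize is not part of XGBoost):

#include <cstddef>
#include <vector>

// Illustrative only: per-device shard size from an explicit offsets vector of
// n_devices + 1 entries. A zero entry at index + 1 marks a shard whose offsets
// have not been filled in yet (e.g. an empty external-memory page), so the
// size is 0 instead of tripping a check against the total size.
std::size_t ExplicitShardSize(const std::vector<std::size_t>& offsets,
                              std::size_t index,
                              std::size_t overlap,
                              std::size_t n_devices) {
  if (offsets.at(index + 1) == 0) { return 0; }
  return offsets.at(index + 1) - offsets.at(index) +
         (index == n_devices - 1 ? overlap : 0);
}
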
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 39b5c87..639b977 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -178,7 +178,6 @@ __device__ float GetLeafWeight(bst_uint ridx, const DevicePredictionNode* tree,
return n.GetWeight();
}
-template <int BLOCK_THREADS>
__global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
common::Span<float> d_out_predictions,
common::Span<size_t> d_tree_segments,
@@ -231,9 +230,16 @@ class GPUPredictor : public xgboost::Predictor {
auto data_span = data.DeviceSpan(device);
dh::safe_cuda(cudaSetDevice(device));
// copy the last element from every shard
- dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
- &data_span[data_span.size()-1],
- sizeof(size_t), cudaMemcpyDeviceToHost));
+ if (!data_span.size()) {
+ size_t zero_size = 0;
+ dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
+ &zero_size,
+ sizeof(size_t), cudaMemcpyHostToHost));
+ } else {
+ dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1),
+ &data_span[data_span.size()-1],
+ sizeof(size_t), cudaMemcpyDeviceToHost));
+ }
}
}
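
The hunk above guards the "copy the last element from every shard" step against an empty device span, which can occur when an external-memory page leaves a device with no rows. A minimal sketch of the same guard, with hypothetical names (GatherShardOffsets, d_span_ptrs) and without XGBoost's cudaSetDevice / dh::safe_cuda wrapping; the patch itself routes even the zero through cudaMemcpy (HostToHost), while a plain host assignment is shown here for brevity:

#include <cuda_runtime.h>
#include <cstddef>
#include <vector>

// Illustrative only: collect the last row-offset from each device shard into
// a host-side cumulative offsets vector, treating an empty shard as 0 rather
// than reading one element past the end of its span.
void GatherShardOffsets(const std::vector<const std::size_t*>& d_span_ptrs,
                        const std::vector<std::size_t>& span_sizes,
                        std::vector<std::size_t>* out_offsets) {
  out_offsets->assign(d_span_ptrs.size() + 1, 0);
  for (std::size_t shard = 0; shard < d_span_ptrs.size(); ++shard) {
    if (span_sizes[shard] == 0) {
      (*out_offsets)[shard + 1] = 0;  // empty shard on this device
    } else {
      cudaMemcpy(&(*out_offsets)[shard + 1],
                 d_span_ptrs[shard] + span_sizes[shard] - 1,
                 sizeof(std::size_t), cudaMemcpyDeviceToHost);
    }
  }
}
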
@@ -267,7 +273,7 @@ class GPUPredictor : public xgboost::Predictor {
cudaMemcpyHostToDevice));
const int BLOCK_THREADS = 128;
- size_t num_rows = batch.offset.DeviceSize(device_) - 1;
+ ssize_t num_rows = batch.offset.DeviceSize(device_) - 1;
if (num_rows < 1) { return; }
const int GRID_SIZE = static_cast<int>(dh::DivRoundUp(num_rows, BLOCK_THREADS));
@@ -283,8 +289,10 @@ class GPUPredictor : public xgboost::Predictor {
size_t entry_start = data_distr.ShardStart(batch.data.Size(),
data_distr.Devices().Index(device_));
- PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
- (dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_),
+ PredictKernel<<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
+ (dh::ToSpan(nodes_),
+ predictions->DeviceSpan(device_),
+ dh::ToSpan(tree_segments_),
dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_),
batch.data.DeviceSpan(device_), tree_begin, tree_end, info.num_col_,
num_rows, entry_start, use_shared, model.param.num_output_group);
@@ -323,20 +331,59 @@ class GPUPredictor : public xgboost::Predictor {
h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
}
- size_t i_batch = 0;
+ // Accumulate the predictions from all batches into this host vector
+ std::vector<float> &hvec = out_preds->HostVector();
+ std::vector<float>::iterator hitr = hvec.begin();
for (const auto &batch : dmat->GetRowBatches()) {
- CHECK_EQ(i_batch, 0) << "External memory not supported";
// out_preds have been resharded and resized in InitOutPredictions()
batch.offset.Reshard(GPUDistribution::Overlap(devices_, 1));
+
std::vector<size_t> device_offsets;
DeviceOffsets(batch.offset, &device_offsets);
+
batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets));
+
+ // Gather the batch predictions, but first reset the number of rows
+ // to what is present in this batch, so that the local batch predictions
+ // are resharded across the devices already present
+ struct RowReset {
+ public:
+ RowReset(MetaInfo &minfo, const SparsePage &r_batch)
+ : minfo_(minfo) {
+ orig_nrows_ = minfo_.num_row_;
+ minfo.num_row_ = r_batch.Size();
+ }
+
+ ~RowReset() {
+ minfo_.num_row_ = orig_nrows_;
+ }
+ private:
+ MetaInfo &minfo_;
+ size_t orig_nrows_;
+ } rowgd(dmat->Info(), batch);
+
+ HostDeviceVector<bst_float> batchPreds;
+ this->InitOutPredictions(dmat->Info(), &batchPreds, model);
+ // Starting from hitr, copy the batch.Size() * model.param.num_output_group
+ // predictions that were previously present into batchPreds
+ std::copy(hitr, hitr + (batch.Size() * model.param.num_output_group), batchPreds.HostVector().begin());
+
dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
- shard.PredictInternal(batch, dmat->Info(), out_preds, model,
+ shard.PredictInternal(batch, dmat->Info(), &batchPreds, model,
h_tree_segments, h_nodes, tree_begin, tree_end);
});
- i_batch++;
+
+ // Copy the batch predictions to the global prediction array
+ std::copy(batchPreds.ConstHostVector().begin(), batchPreds.ConstHostVector().end(), hitr);
+ std::advance(hitr, batchPreds.Size());
+
+ // Sparse pages can be reused by the threaded iterator when there are several
+ // of them. Thus, for Reshard to work on future pages, we empty out the
+ // current distribution, as Reshard expects at least one of the source/target
+ // distributions to be empty
+ batch.offset.Reshard(GPUDistribution());
+ batch.data.Reshard(GPUDistribution());
}
monitor_.StopCuda("DevicePredictInternal");
}
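
The external-memory loop above combines three pieces: a scope guard that temporarily overrides MetaInfo::num_row_ with the batch's row count, a per-batch prediction buffer (batchPreds), and a host-side cursor (hitr) that appends each batch's output into the full prediction vector before the batch's offset/data distributions are reset for reuse. A minimal standalone sketch of the guard and cursor pattern, using hypothetical names (ScopedRowCount, AppendBatch) that are not part of XGBoost:

#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative RAII guard mirroring the anonymous RowReset struct above:
// temporarily present the current batch's row count as the dataset's row
// count and restore the original value when the guard leaves scope.
struct ScopedRowCount {
  ScopedRowCount(std::size_t* num_rows, std::size_t batch_rows)
      : num_rows_(num_rows), saved_(*num_rows) {
    *num_rows_ = batch_rows;
  }
  ~ScopedRowCount() { *num_rows_ = saved_; }

 private:
  std::size_t* num_rows_;
  std::size_t saved_;
};

// Illustrative accumulation: copy one batch's predictions into the global
// output at a moving cursor, the same bookkeeping the patch does with hitr.
void AppendBatch(const std::vector<float>& batch_preds,
                 std::vector<float>::iterator* cursor) {
  *cursor = std::copy(batch_preds.begin(), batch_preds.end(), *cursor);
}

Like the RowReset struct in the patch, the guard relies on scope exit rather than an explicit restore call, so the original row count is restored even if prediction exits the loop body early.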