Fix Beam Search for CPU and CUDA... Also include it in our API (#669)

### Summary Beam Search was hanging and not outputting correct results. Furthermore, it did not fit into our API design. This PR addresses the correctness and API problems with Beam search. We plan to improve both CPU and CUDA performance and memory efficiency in the near future. ### Issues Addressed Here is a quick summary of the issues addressed by this PR... these apply to both CPU and CUDA implementations: - No log-softmax normalization was performed before adding beam scores. This caused faulty outputs which did not match the ORT implementation. - The `is_done` flag was not set or checked properly in the case of EOS token or `max_sequence_length`. This caused hanging, infinite looping, and memory buffer overflow. This sometimes gave the impression of bad performance, while in reality it was a correctness issue. - `Finalize` was not called automatically. If a user didn't call it manually this could cause a floating point exception or other fault. - There was no easy way to get output from Beam Search. `Finalize` was clunky and unintuitive as it didn't fit with our API. - Our testing file was not up to date with our latest APIs. ### API Changes Given the issues with `Finalize`, this PR introduces an update to the way Beam Search fits into our API. The user no longer has to manually call `Finalize` in order to access the Beam Search results. These are returned automatically by the `Generate()` function and can be accessed using batch beam indexing. --------- Co-authored-by: Baiju Meswani <[email protected]>
microsoft · Jul 11, 2024 · e41fb2c · e41fb2c
1 parent 6166902
commit e41fb2c
Show file tree

Hide file tree

Showing 16 changed files with 213 additions and 204 deletions.
diff --git a/src/beam_search_scorer.cpp b/src/beam_search_scorer.cpp
@@ -16,7 +16,7 @@ void BeamHypotheses::Init(float length_penalty, std::span<HypothesisScore> beams
   done_ = false;
 }
 
-void BeamHypotheses::Add(std::span<const int32_t> hypothesis, float sum_logprobs) {
+void BeamHypotheses::Add(cpu_span<int32_t> hypothesis, float sum_logprobs) {
   auto length = hypothesis.size();
   float const score = sum_logprobs / std::pow(static_cast<float>(length), length_penalty_);
 
@@ -43,28 +43,6 @@ bool BeamHypotheses::CanImprove(float best_sum_logprobs, int current_length) con
   return beams_.back().score < current_score;
 }
 
-void BeamHypotheses::Output(
-    size_t top_k,
-    size_t max_length,
-    std::span<int32_t> sequences,             // buffer filled with pad token ID, shape (num_return_sequences, max_length)
-    std::span<float> sequences_scores) const  // buffer of shape (num_return_sequences) or empty
-{
-  // Copy the top_k beams into the sequences
-  assert(top_k <= beams_used_);
-  for (int index = 0; index < top_k; index++) {
-    auto& item = beams_[index];
-    std::span<int32_t> const target = sequences.subspan(index * max_length, max_length);
-
-    // Note that word_ids might be less than max_length.
-    // Since the sequences has been filled with pad token ID, so padding is not needed here.
-    copy(item.hypothesis, target);
-
-    if (!sequences_scores.empty()) {
-      sequences_scores[index] = item.score;
-    }
-  }
-}
-
 BeamSearchScorer::BeamSearchScorer(const GeneratorParams& parameters)
     : batch_size_{parameters.batch_size},
       num_beams_{parameters.search.num_beams},
@@ -110,7 +88,7 @@ void BeamSearchScorer::Process(Sequences& sequences,
   // It contains word ID of whole sequence generated so far.
   // It is different from subgraph input_ids, which only need one word when past state is not empty.
 
-  const int sequence_length = sequences.GetSequenceLength();
+  size_t sequence_length = static_cast<size_t>(sequences.GetSequenceLength());
 
   assert(next_scores.size() == next_tokens.size());
   assert(next_scores.size() == next_indices.size());
@@ -146,11 +124,12 @@ void BeamSearchScorer::Process(Sequences& sequences,
         }
 
         // Clone the sequence and append to buffer.
-        std::span<const int32_t> const src = sequences.GetSequence(batch_beam_idx);
-        auto clone = hypothesis_buffer_.subspan(static_cast<size_t>(hypothesis_buffer_used_), sequence_length);
+        cpu_span<const int32_t> const src{sequences.GetSequence(batch_beam_idx)};
+        auto clone_span = hypothesis_buffer_.subspan(static_cast<size_t>(hypothesis_buffer_used_), sequence_length);
+        cpu_span<int32_t> clone{clone_span.data(), sequence_length};
 
         copy(src, clone);
-        hypothesis_buffer_used_ += sequence_length;
+        hypothesis_buffer_used_ += static_cast<int>(sequence_length);
         beam_hyp.Add(clone, next_score);
       } else {
         // Add next predicted token since it is not eos_token.
@@ -177,7 +156,7 @@ void BeamSearchScorer::Process(Sequences& sequences,
     if (!early_stopping_) {
       std::span<const float> const topk_scores = next_scores.subspan(batch * num_beams_, top_k);
       const auto best_sum_logprobs = std::max_element(topk_scores.begin(), topk_scores.end());
-      if (beam_hyp.CanImprove(*best_sum_logprobs, sequence_length)) {
+      if (beam_hyp.CanImprove(*best_sum_logprobs, static_cast<int>(sequence_length))) {
         continue;
       }
     }
@@ -188,12 +167,7 @@ void BeamSearchScorer::Process(Sequences& sequences,
 }
 
 void BeamSearchScorer::Finalize(Sequences& sequences,
-                                size_t num_return_sequences,
-                                cpu_span<int32_t> output,
-                                cpu_span<float> sequence_scores) {
-  // output is Word IDs of each sequence, with shape (batch_size * num_return_sequences, max_sequence_length).
-  // sequence_scores is the optional Score of each sequence, with shape (batch_size * num_return_sequences).
-
+                                size_t num_return_sequences) {
   // Finalize all open beam hypotheses and add to generated hypotheses.
   for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) {
     BeamHypotheses& beam_hyp = beam_hyps_[batch_index];
@@ -208,23 +182,6 @@ void BeamSearchScorer::Finalize(Sequences& sequences,
       beam_hyp.Add(final_tokens, final_score);
     }
   }
-
-  // Fill output sequences with pad token ID so that we do not need append it later.
-  std::fill_n(output.data(), output.size(), pad_token_id_);
-
-  // Select the best hypotheses according to number of sequences to return.
-  for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) {
-    BeamHypotheses& beam_hyp = beam_hyps_[batch_index];
-
-    auto batch_output = output.subspan(batch_index * num_return_sequences * max_length_,
-                                       num_return_sequences * max_length_);
-    std::span<float> sequence_scores_buffer;
-    if (!sequence_scores.empty()) {
-      sequence_scores_buffer = sequence_scores.subspan(batch_index * num_return_sequences, num_return_sequences);
-    }
-
-    beam_hyp.Output(num_return_sequences, max_length_, batch_output, sequence_scores_buffer);
-  }
 }
 
 }  // namespace Generators
diff --git a/src/beam_search_scorer.h b/src/beam_search_scorer.h
@@ -1,11 +1,12 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
+#include "sequences.h"
+#pragma once
 // The implementation is based on huggingface transformers generation_beam_search.py
 namespace Generators {
 
 struct HypothesisScore {
-  std::span<const int32_t> hypothesis;
+  cpu_span<int32_t> hypothesis;
   float score;
 };
 
@@ -14,16 +15,14 @@ struct BeamHypotheses {
   void Init(float length_penalty, std::span<HypothesisScore> beams);
 
   // Add a new hypothesis
-  void Add(std::span<const int32_t> hypothesis, float sum_logprobs);
+  void Add(cpu_span<int32_t> hypothesis, float sum_logprobs);
 
   // Return true if this beats the worst score in the hypothesis
   bool CanImprove(float best_sum_logprobs, int current_length) const;
 
-  // Output results
-  void Output(size_t top_k,                              // number of sequences to return
-              size_t max_length,                         // max sequence length
-              std::span<int32_t> sequences,              // buffer with pad token, shape (num_return_sequences, max_length)
-              std::span<float> sequences_scores) const;  // buffer for sequence scores, with shape (num_return_sequences)
+  RoamingArray<int32_t> GetHypothesis(size_t index) const { return beams_[index].hypothesis; }
+
+  // TODO(aciddelgado): Methods to get all hypotheses and scores
 
   std::span<HypothesisScore> beams_;  // Beam width sized array of hypotheses, sorted by highest scoring
   int beams_used_;                    // Number of elements used in beams_
@@ -40,15 +39,14 @@ struct BeamSearchScorer {
                std::span<const int32_t> next_indices);
 
   void Finalize(Sequences& sequences,
-                size_t num_return_sequences,
-                cpu_span<int32_t> output_sequences,
-                cpu_span<float> output_sequence_scores);
+                size_t num_return_sequences);
 
   bool IsDone() const { return not_done_count_ == 0; }
 
   cpu_span<float> GetNextScores() { return next_beam_scores_; }
   cpu_span<int32_t> GetNextTokens() { return next_beam_tokens_; }
   cpu_span<int32_t> GetNextIndicesCPU() { return next_beam_indices_; }
+  BeamHypotheses GetBeamHypotheses(size_t batch_id) { return beam_hyps_[batch_id]; }
 
  private:
   int batch_size_;

diff --git a/src/beam_search_scorer_cuda.cpp b/src/beam_search_scorer_cuda.cpp
@@ -76,11 +76,18 @@ bool BeamSearchScorer_Cuda::IsDoneLater() const {
 }
 
 void BeamSearchScorer_Cuda::Finalize(Sequences_Cuda& sequences,
-                                     size_t num_return_sequences,
-                                     std::span<int32_t> output,           // Word IDs of each sequence, with shape (batch_size * num_return_sequences, max_sequence_length)
-                                     std::span<float> sequence_scores) {  // Score of each sequence, with shape (batch_size * num_return_sequences).
-  assert(!output.empty());
-  cuda::LaunchBeamSearchScorer_Finalize(state_cpu_->batch_size_, *state_gpu_, sequences.GetSequences(), sequences.GetSequenceLength(), beam_hyps_, next_beam_scores_, output, sequence_scores, stream_);
+                                     size_t num_return_sequences) {
+  cuda::LaunchBeamSearchScorer_Finalize(state_cpu_->batch_size_, *state_gpu_, sequences.GetSequences(), sequences.GetSequenceLength(), beam_hyps_, next_beam_scores_, stream_);
+}
+
+RoamingArray<int32_t> BeamSearchScorer_Cuda::GetBeamHypothesis(size_t batch_id, size_t beam_id) const {
+  cuda_host_unique_ptr<int32_t*> hypothesis_ptr = CudaMallocHostArray<int32_t*>(1);
+  cuda_host_unique_ptr<int> hypothesis_length = CudaMallocHostArray<int>(1);
+  cuda_host_unique_ptr<float> hypothesis_score = CudaMallocHostArray<float>(1);
+  cuda::LaunchBeamSearchScorer_GetHypothesisPtr(batch_id, beam_id, beam_hyps_, hypothesis_ptr.get(), hypothesis_length.get(), hypothesis_score.get(), stream_);
+  CudaCheck() == cudaStreamSynchronize(stream_);
+  std::span<int32_t> hypothesis_span(*hypothesis_ptr.get(), *hypothesis_length.get());
+  return gpu_span<int32_t>{hypothesis_span.data(), hypothesis_span.size()};
 }
 
 }  // namespace Generators
diff --git a/src/beam_search_scorer_cuda.cu b/src/beam_search_scorer_cuda.cu
@@ -71,30 +71,6 @@ __device__ bool BeamHypotheses::CanImprove(float best_sum_logprobs, int current_
   return beams_[beams_count_ - 1].score < current_score;
 }
 
-__device__ void BeamHypotheses::Output(
-    int top_k,
-    int max_length,
-    int pad_token_id,
-    int32_t* sequences,       // buffer of shape (num_return_sequences, max_length)
-    float* sequences_scores)  // buffer of shape (num_return_sequences) or empty
-{
-  // Copy the top_k beams into the sequences
-  for (int index = 0; index < top_k; index++) {
-    auto& item = beams_[index];
-    int32_t* target = sequences + index * max_length;
-
-    // Note that word_ids might be less than max_length.
-    for (int i = 0; i < item.hypothesis_length; i++)
-      target[i] = item.hypothesis[i];
-    // Pad remaining values with pad token id
-    for (int i = item.hypothesis_length; i < max_length; i++)
-      target[i] = pad_token_id;
-
-    if (sequences_scores)
-      sequences_scores[index] = item.score;
-  }
-}
-
 __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
                                          BeamScorerState& state,
                                          const int32_t* sequences_buffer,
@@ -132,7 +108,7 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
           continue;
         }
 
-        // Clone the sequence and append to buffer.
+        // Clone the sequence and append to buffer. // TODO(aciddelgado): why do we need to clone the sequence here?
         const int32_t* src = sequences_buffer + batch_beam_idx * state.max_length_;
         auto clone = hypothesis_buffer_ + atomicAdd(&state.hypothesis_buffer_used_, sequence_length);
 
@@ -156,7 +132,7 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
     if (beam_hyp.beams_used_ == state.num_beams_) {
       if (state.early_stopping_ || !beam_hyp.CanImprove(*std::max_element(next_scores + batch_start, next_scores + batch_start + top_k), sequence_length)) {
         beam_hyp.done_ = true;
-        if (atomicAdd(&state.not_done_count_, -1) == 0)
+        if (atomicAdd(&state.not_done_count_, -1) == 1)
           state_cpu.not_done_count_ = 0;  // Update the CPU side
       }
     }
@@ -238,8 +214,7 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp
     block_size.x = batch_beam_size;
     block_size.y = sequence_length;
   } else {
-    if (sequence_length <= max_threads)  // Sequence length fits into thread block, but batch_beam_size does not, so chunk it
-    {
+    if (sequence_length <= max_threads) {  // Sequence length fits into thread block, but batch_beam_size does not, so chunk it
       block_size.x = max_threads / sequence_length;
       block_size.y = sequence_length;
 
@@ -269,9 +244,7 @@ __global__ void BeamSearchScorer_Finalize(BeamScorerState& state,
                                           const int32_t* sequences_buffer,
                                           int sequence_length,
                                           BeamHypotheses* beam_hyps_,
-                                          const float* final_beam_scores,
-                                          int32_t* output,
-                                          float* sequence_scores) {
+                                          const float* final_beam_scores) {
   int batch_index = blockIdx.x * blockDim.x + threadIdx.x;
   if (batch_index >= state.batch_size_)
     return;
@@ -286,18 +259,6 @@ __global__ void BeamSearchScorer_Finalize(BeamScorerState& state,
       beam_hyp.Add(final_tokens, sequence_length, final_score);
     }
   }
-
-  int num_return_sequences = 1;
-
-  // Select the best hypotheses according to number of sequences to return.
-  auto batch_output = output + batch_index * num_return_sequences * state.max_length_;
-
-  beam_hyp.Output(
-      num_return_sequences,
-      state.max_length_,
-      state.pad_token_id_,
-      batch_output,
-      sequence_scores ? sequence_scores + batch_index * num_return_sequences : nullptr);
 }
 
 void LaunchBeamSearchScorer_Finalize(int batch_size,
@@ -306,16 +267,40 @@ void LaunchBeamSearchScorer_Finalize(int batch_size,
                                      int sequence_length,
                                      std::span<BeamHypotheses> beam_hyps,
                                      std::span<const float> final_beam_scores,
-                                     std::span<int32_t> output,
-                                     std::span<float> sequence_scores,
                                      cudaStream_t stream) {
   BeamSearchScorer_Finalize<<<1, batch_size, 0, stream>>>(state,
                                                           sequences.data(),
                                                           sequence_length,
                                                           beam_hyps.data(),
-                                                          final_beam_scores.data(),
-                                                          output.data(),
-                                                          sequence_scores.data());
+                                                          final_beam_scores.data());
+}
+
+__global__ void BeamSearchScorer_GetHypothesisPtr(size_t batch_id,
+                                                  size_t beam_id,
+                                                  BeamHypotheses* beam_hyps_data,
+                                                  int32_t** hypothesis_ptr,
+                                                  int* hypothesis_length,
+                                                  float* hypothesis_score) {
+  auto& beam_hyp = beam_hyps_data[batch_id];
+  auto& item = beam_hyp.beams_[beam_id];
+  hypothesis_ptr[0] = const_cast<int32_t*>(item.hypothesis);
+  hypothesis_length[0] = item.hypothesis_length;
+  hypothesis_score[0] = item.score;
+}
+
+void LaunchBeamSearchScorer_GetHypothesisPtr(size_t batch_id,
+                                             size_t beam_id,
+                                             gpu_span<BeamHypotheses> beam_hyps,
+                                             int32_t** hypothesis_ptr,
+                                             int* hypothesis_length,
+                                             float* hypothesis_score,
+                                             cudaStream_t stream) {
+  BeamSearchScorer_GetHypothesisPtr<<<1, 1, 0, stream>>>(batch_id,
+                                                         beam_id,
+                                                         beam_hyps.data(),
+                                                         hypothesis_ptr,
+                                                         hypothesis_length,
+                                                         hypothesis_score);
 }
 
 __global__ void InitScoresKernel(float* beam_scores,

diff --git a/src/beam_search_scorer_cuda.cuh b/src/beam_search_scorer_cuda.cuh
@@ -1,3 +1,5 @@
+#include "smartptrs.h"
+
 namespace Generators {
 namespace cuda {
 
@@ -19,13 +21,6 @@ struct BeamHypotheses {
 
   // Return true if this beats the worst score in the hypothesis
   __device__ bool CanImprove(float best_sum_logprobs, int current_length) const;
-
-  // Output results
-  __device__ void Output(int top_k,                 // number of sequences to return
-                         int max_length,            // max sequence length
-                         int pad_token_id,          // pad token
-                         int32_t* sequences,        // buffer with pad token, shape (num_return_sequences, max_length)
-                         float* sequences_scores);  // buffer for sequence scores, with shape (num_return_sequences)
 };
 
 struct BeamScorerState {
@@ -71,10 +66,17 @@ void LaunchBeamSearchScorer_Finalize(int batch_size,
                                      int sequence_length,
                                      std::span<BeamHypotheses> beam_hyps_,
                                      std::span<const float> final_beam_scores,
-                                     std::span<int32_t> output,
-                                     std::span<float> sequence_scores,
                                      cudaStream_t stream);
 
+// Since we need to index through a couple layers of GPU memory, we need to provide a way to get the pointers
+void LaunchBeamSearchScorer_GetHypothesisPtr(size_t batch_id,
+                                             size_t beam_id,
+                                             gpu_span<BeamHypotheses> beam_hyps,
+                                             int32_t** hypothesis_ptr,
+                                             int* hypothesis_length,
+                                             float* hypothesis_score,
+                                             cudaStream_t stream);
+
 void LaunchInitScoresKernel(float* beam_scores,
                             int batch_size,
                             int num_beams,

diff --git a/src/beam_search_scorer_cuda.h b/src/beam_search_scorer_cuda.h
@@ -9,9 +9,7 @@ struct BeamSearchScorer_Cuda {
                std::span<const int32_t> next_indices);
 
   void Finalize(Sequences_Cuda& sequences,
-                size_t num_return_sequences,
-                std::span<int32_t> output_sequences,
-                std::span<float> output_sequence_scores);
+                size_t num_return_sequences);
 
   bool IsDone() const { return false; }  // For CUDA we speculatively run the next step while we wait for the GPU to report status. We use 'IsDoneLater()' for this
   bool IsDoneLater() const;
@@ -24,6 +22,7 @@ struct BeamSearchScorer_Cuda {
     return next_beam_indices_cpu_;
   }
   gpu_span<int32_t> GetNextIndicesGPU() { return next_beam_indices_; }
+  RoamingArray<int32_t> GetBeamHypothesis(size_t batch_id, size_t beam_id) const;
 
  private:
   mutable cuda_event_holder event_process_complete_;

diff --git a/src/cuda_sampling.cu b/src/cuda_sampling.cu
@@ -299,7 +299,7 @@ __global__ void SoftmaxBlockForward(outscalar_t* output, scalar_t* input, int cl
 
 template <bool is_log_softmax>
 void DispatchBlockwiseSoftmaxForward(cudaStream_t* stream, float* output, const float* input, int softmax_elements,
-                                          int input_stride, int output_stride, int batch_count, float temperature=1.0) {
+                                          int input_stride, int output_stride, int batch_count, float temperature) {
   dim3 grid(batch_count);
   constexpr int ILP = sizeof(float4) / sizeof(float);
   dim3 block = SoftmaxGetBlockSize(ILP, softmax_elements);
@@ -313,6 +313,7 @@ void DispatchBlockwiseSoftmaxForward(cudaStream_t* stream, float* output, const
                                                            softmax_elements, input_stride, output_stride, temperature);
   }
 }
+template void DispatchBlockwiseSoftmaxForward<true>(cudaStream_t*, float*, const float*, int, int, int, int, float);
 
 // Populate Kernels and Launchers