Skip to content

Commit

Permalink
Fix Beam Search for CPU and CUDA... Also include it in our API (#669)
Browse files Browse the repository at this point in the history
### Summary
Beam Search was hanging and not outputting correct results. Furthermore,
it did not fit into our API design. This PR addresses the correctness
and API problems with Beam search. We plan to improve both CPU and CUDA
performance and memory efficiency in the near future.

### Issues Addressed
Here is a quick summary of the issues addressed by this PR... these
apply to both CPU and CUDA implementations:
- No log-softmax normalization was performed before adding beam scores.
This caused faulty outputs which did not match the ORT implementation.
- The `is_done` flag was not set or checked properly in the case of EOS
token or `max_sequence_length`. This caused hanging, infinite looping,
and memory buffer overflow. This sometimes gave the impression of bad
performance, while in reality it was a correctness issue.
- `Finalize` was not called automatically. If a user didn't call it
manually this could cause a floating point exception or other fault.
- There was no easy way to get output from Beam Search. `Finalize` was
clunky and unintuitive as it didn't fit with our API.
- Our testing file was not up to date with our latest APIs.


### API Changes
Given the issues with `Finalize`, this PR introduces an update to the
way Beam Search fits into our API. The user no longer has to manually
call `Finalize` in order to access the Beam Search results. These are
returned automatically by the `Generate()` function and can be accessed
using batch beam indexing.

---------

Co-authored-by: Baiju Meswani <[email protected]>
  • Loading branch information
aciddelgado and baijumeswani authored Jul 11, 2024
1 parent 6166902 commit e41fb2c
Show file tree
Hide file tree
Showing 16 changed files with 213 additions and 204 deletions.
59 changes: 8 additions & 51 deletions src/beam_search_scorer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ void BeamHypotheses::Init(float length_penalty, std::span<HypothesisScore> beams
done_ = false;
}

void BeamHypotheses::Add(std::span<const int32_t> hypothesis, float sum_logprobs) {
void BeamHypotheses::Add(cpu_span<int32_t> hypothesis, float sum_logprobs) {
auto length = hypothesis.size();
float const score = sum_logprobs / std::pow(static_cast<float>(length), length_penalty_);

Expand All @@ -43,28 +43,6 @@ bool BeamHypotheses::CanImprove(float best_sum_logprobs, int current_length) con
return beams_.back().score < current_score;
}

void BeamHypotheses::Output(
size_t top_k,
size_t max_length,
std::span<int32_t> sequences, // buffer filled with pad token ID, shape (num_return_sequences, max_length)
std::span<float> sequences_scores) const // buffer of shape (num_return_sequences) or empty
{
// Copy the top_k beams into the sequences
assert(top_k <= beams_used_);
for (int index = 0; index < top_k; index++) {
auto& item = beams_[index];
std::span<int32_t> const target = sequences.subspan(index * max_length, max_length);

// Note that word_ids might be less than max_length.
// Since the sequences has been filled with pad token ID, so padding is not needed here.
copy(item.hypothesis, target);

if (!sequences_scores.empty()) {
sequences_scores[index] = item.score;
}
}
}

BeamSearchScorer::BeamSearchScorer(const GeneratorParams& parameters)
: batch_size_{parameters.batch_size},
num_beams_{parameters.search.num_beams},
Expand Down Expand Up @@ -110,7 +88,7 @@ void BeamSearchScorer::Process(Sequences& sequences,
// It contains word ID of whole sequence generated so far.
// It is different from subgraph input_ids, which only need one word when past state is not empty.

const int sequence_length = sequences.GetSequenceLength();
size_t sequence_length = static_cast<size_t>(sequences.GetSequenceLength());

assert(next_scores.size() == next_tokens.size());
assert(next_scores.size() == next_indices.size());
Expand Down Expand Up @@ -146,11 +124,12 @@ void BeamSearchScorer::Process(Sequences& sequences,
}

// Clone the sequence and append to buffer.
std::span<const int32_t> const src = sequences.GetSequence(batch_beam_idx);
auto clone = hypothesis_buffer_.subspan(static_cast<size_t>(hypothesis_buffer_used_), sequence_length);
cpu_span<const int32_t> const src{sequences.GetSequence(batch_beam_idx)};
auto clone_span = hypothesis_buffer_.subspan(static_cast<size_t>(hypothesis_buffer_used_), sequence_length);
cpu_span<int32_t> clone{clone_span.data(), sequence_length};

copy(src, clone);
hypothesis_buffer_used_ += sequence_length;
hypothesis_buffer_used_ += static_cast<int>(sequence_length);
beam_hyp.Add(clone, next_score);
} else {
// Add next predicted token since it is not eos_token.
Expand All @@ -177,7 +156,7 @@ void BeamSearchScorer::Process(Sequences& sequences,
if (!early_stopping_) {
std::span<const float> const topk_scores = next_scores.subspan(batch * num_beams_, top_k);
const auto best_sum_logprobs = std::max_element(topk_scores.begin(), topk_scores.end());
if (beam_hyp.CanImprove(*best_sum_logprobs, sequence_length)) {
if (beam_hyp.CanImprove(*best_sum_logprobs, static_cast<int>(sequence_length))) {
continue;
}
}
Expand All @@ -188,12 +167,7 @@ void BeamSearchScorer::Process(Sequences& sequences,
}

void BeamSearchScorer::Finalize(Sequences& sequences,
size_t num_return_sequences,
cpu_span<int32_t> output,
cpu_span<float> sequence_scores) {
// output is Word IDs of each sequence, with shape (batch_size * num_return_sequences, max_sequence_length).
// sequence_scores is the optional Score of each sequence, with shape (batch_size * num_return_sequences).

size_t num_return_sequences) {
// Finalize all open beam hypotheses and add to generated hypotheses.
for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) {
BeamHypotheses& beam_hyp = beam_hyps_[batch_index];
Expand All @@ -208,23 +182,6 @@ void BeamSearchScorer::Finalize(Sequences& sequences,
beam_hyp.Add(final_tokens, final_score);
}
}

// Fill output sequences with pad token ID so that we do not need append it later.
std::fill_n(output.data(), output.size(), pad_token_id_);

// Select the best hypotheses according to number of sequences to return.
for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) {
BeamHypotheses& beam_hyp = beam_hyps_[batch_index];

auto batch_output = output.subspan(batch_index * num_return_sequences * max_length_,
num_return_sequences * max_length_);
std::span<float> sequence_scores_buffer;
if (!sequence_scores.empty()) {
sequence_scores_buffer = sequence_scores.subspan(batch_index * num_return_sequences, num_return_sequences);
}

beam_hyp.Output(num_return_sequences, max_length_, batch_output, sequence_scores_buffer);
}
}

} // namespace Generators
20 changes: 9 additions & 11 deletions src/beam_search_scorer.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "sequences.h"
#pragma once
// The implementation is based on huggingface transformers generation_beam_search.py
namespace Generators {

struct HypothesisScore {
std::span<const int32_t> hypothesis;
cpu_span<int32_t> hypothesis;
float score;
};

Expand All @@ -14,16 +15,14 @@ struct BeamHypotheses {
void Init(float length_penalty, std::span<HypothesisScore> beams);

// Add a new hypothesis
void Add(std::span<const int32_t> hypothesis, float sum_logprobs);
void Add(cpu_span<int32_t> hypothesis, float sum_logprobs);

// Return true if this beats the worst score in the hypothesis
bool CanImprove(float best_sum_logprobs, int current_length) const;

// Output results
void Output(size_t top_k, // number of sequences to return
size_t max_length, // max sequence length
std::span<int32_t> sequences, // buffer with pad token, shape (num_return_sequences, max_length)
std::span<float> sequences_scores) const; // buffer for sequence scores, with shape (num_return_sequences)
RoamingArray<int32_t> GetHypothesis(size_t index) const { return beams_[index].hypothesis; }

// TODO(aciddelgado): Methods to get all hypotheses and scores

std::span<HypothesisScore> beams_; // Beam width sized array of hypotheses, sorted by highest scoring
int beams_used_; // Number of elements used in beams_
Expand All @@ -40,15 +39,14 @@ struct BeamSearchScorer {
std::span<const int32_t> next_indices);

void Finalize(Sequences& sequences,
size_t num_return_sequences,
cpu_span<int32_t> output_sequences,
cpu_span<float> output_sequence_scores);
size_t num_return_sequences);

bool IsDone() const { return not_done_count_ == 0; }

cpu_span<float> GetNextScores() { return next_beam_scores_; }
cpu_span<int32_t> GetNextTokens() { return next_beam_tokens_; }
cpu_span<int32_t> GetNextIndicesCPU() { return next_beam_indices_; }
BeamHypotheses GetBeamHypotheses(size_t batch_id) { return beam_hyps_[batch_id]; }

private:
int batch_size_;
Expand Down
17 changes: 12 additions & 5 deletions src/beam_search_scorer_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,18 @@ bool BeamSearchScorer_Cuda::IsDoneLater() const {
}

void BeamSearchScorer_Cuda::Finalize(Sequences_Cuda& sequences,
size_t num_return_sequences,
std::span<int32_t> output, // Word IDs of each sequence, with shape (batch_size * num_return_sequences, max_sequence_length)
std::span<float> sequence_scores) { // Score of each sequence, with shape (batch_size * num_return_sequences).
assert(!output.empty());
cuda::LaunchBeamSearchScorer_Finalize(state_cpu_->batch_size_, *state_gpu_, sequences.GetSequences(), sequences.GetSequenceLength(), beam_hyps_, next_beam_scores_, output, sequence_scores, stream_);
size_t num_return_sequences) {
cuda::LaunchBeamSearchScorer_Finalize(state_cpu_->batch_size_, *state_gpu_, sequences.GetSequences(), sequences.GetSequenceLength(), beam_hyps_, next_beam_scores_, stream_);
}

RoamingArray<int32_t> BeamSearchScorer_Cuda::GetBeamHypothesis(size_t batch_id, size_t beam_id) const {
cuda_host_unique_ptr<int32_t*> hypothesis_ptr = CudaMallocHostArray<int32_t*>(1);
cuda_host_unique_ptr<int> hypothesis_length = CudaMallocHostArray<int>(1);
cuda_host_unique_ptr<float> hypothesis_score = CudaMallocHostArray<float>(1);
cuda::LaunchBeamSearchScorer_GetHypothesisPtr(batch_id, beam_id, beam_hyps_, hypothesis_ptr.get(), hypothesis_length.get(), hypothesis_score.get(), stream_);
CudaCheck() == cudaStreamSynchronize(stream_);
std::span<int32_t> hypothesis_span(*hypothesis_ptr.get(), *hypothesis_length.get());
return gpu_span<int32_t>{hypothesis_span.data(), hypothesis_span.size()};
}

} // namespace Generators
81 changes: 33 additions & 48 deletions src/beam_search_scorer_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,30 +71,6 @@ __device__ bool BeamHypotheses::CanImprove(float best_sum_logprobs, int current_
return beams_[beams_count_ - 1].score < current_score;
}

__device__ void BeamHypotheses::Output(
int top_k,
int max_length,
int pad_token_id,
int32_t* sequences, // buffer of shape (num_return_sequences, max_length)
float* sequences_scores) // buffer of shape (num_return_sequences) or empty
{
// Copy the top_k beams into the sequences
for (int index = 0; index < top_k; index++) {
auto& item = beams_[index];
int32_t* target = sequences + index * max_length;

// Note that word_ids might be less than max_length.
for (int i = 0; i < item.hypothesis_length; i++)
target[i] = item.hypothesis[i];
// Pad remaining values with pad token id
for (int i = item.hypothesis_length; i < max_length; i++)
target[i] = pad_token_id;

if (sequences_scores)
sequences_scores[index] = item.score;
}
}

__global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
BeamScorerState& state,
const int32_t* sequences_buffer,
Expand Down Expand Up @@ -132,7 +108,7 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
continue;
}

// Clone the sequence and append to buffer.
// Clone the sequence and append to buffer. // TODO(aciddelgado): why do we need to clone the sequence here?
const int32_t* src = sequences_buffer + batch_beam_idx * state.max_length_;
auto clone = hypothesis_buffer_ + atomicAdd(&state.hypothesis_buffer_used_, sequence_length);

Expand All @@ -156,7 +132,7 @@ __global__ void BeamSearchScorer_Process(BeamScorerState& state_cpu,
if (beam_hyp.beams_used_ == state.num_beams_) {
if (state.early_stopping_ || !beam_hyp.CanImprove(*std::max_element(next_scores + batch_start, next_scores + batch_start + top_k), sequence_length)) {
beam_hyp.done_ = true;
if (atomicAdd(&state.not_done_count_, -1) == 0)
if (atomicAdd(&state.not_done_count_, -1) == 1)
state_cpu.not_done_count_ = 0; // Update the CPU side
}
}
Expand Down Expand Up @@ -238,8 +214,7 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp
block_size.x = batch_beam_size;
block_size.y = sequence_length;
} else {
if (sequence_length <= max_threads) // Sequence length fits into thread block, but batch_beam_size does not, so chunk it
{
if (sequence_length <= max_threads) { // Sequence length fits into thread block, but batch_beam_size does not, so chunk it
block_size.x = max_threads / sequence_length;
block_size.y = sequence_length;

Expand Down Expand Up @@ -269,9 +244,7 @@ __global__ void BeamSearchScorer_Finalize(BeamScorerState& state,
const int32_t* sequences_buffer,
int sequence_length,
BeamHypotheses* beam_hyps_,
const float* final_beam_scores,
int32_t* output,
float* sequence_scores) {
const float* final_beam_scores) {
int batch_index = blockIdx.x * blockDim.x + threadIdx.x;
if (batch_index >= state.batch_size_)
return;
Expand All @@ -286,18 +259,6 @@ __global__ void BeamSearchScorer_Finalize(BeamScorerState& state,
beam_hyp.Add(final_tokens, sequence_length, final_score);
}
}

int num_return_sequences = 1;

// Select the best hypotheses according to number of sequences to return.
auto batch_output = output + batch_index * num_return_sequences * state.max_length_;

beam_hyp.Output(
num_return_sequences,
state.max_length_,
state.pad_token_id_,
batch_output,
sequence_scores ? sequence_scores + batch_index * num_return_sequences : nullptr);
}

void LaunchBeamSearchScorer_Finalize(int batch_size,
Expand All @@ -306,16 +267,40 @@ void LaunchBeamSearchScorer_Finalize(int batch_size,
int sequence_length,
std::span<BeamHypotheses> beam_hyps,
std::span<const float> final_beam_scores,
std::span<int32_t> output,
std::span<float> sequence_scores,
cudaStream_t stream) {
BeamSearchScorer_Finalize<<<1, batch_size, 0, stream>>>(state,
sequences.data(),
sequence_length,
beam_hyps.data(),
final_beam_scores.data(),
output.data(),
sequence_scores.data());
final_beam_scores.data());
}

__global__ void BeamSearchScorer_GetHypothesisPtr(size_t batch_id,
size_t beam_id,
BeamHypotheses* beam_hyps_data,
int32_t** hypothesis_ptr,
int* hypothesis_length,
float* hypothesis_score) {
auto& beam_hyp = beam_hyps_data[batch_id];
auto& item = beam_hyp.beams_[beam_id];
hypothesis_ptr[0] = const_cast<int32_t*>(item.hypothesis);
hypothesis_length[0] = item.hypothesis_length;
hypothesis_score[0] = item.score;
}

void LaunchBeamSearchScorer_GetHypothesisPtr(size_t batch_id,
size_t beam_id,
gpu_span<BeamHypotheses> beam_hyps,
int32_t** hypothesis_ptr,
int* hypothesis_length,
float* hypothesis_score,
cudaStream_t stream) {
BeamSearchScorer_GetHypothesisPtr<<<1, 1, 0, stream>>>(batch_id,
beam_id,
beam_hyps.data(),
hypothesis_ptr,
hypothesis_length,
hypothesis_score);
}

__global__ void InitScoresKernel(float* beam_scores,
Expand Down
20 changes: 11 additions & 9 deletions src/beam_search_scorer_cuda.cuh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include "smartptrs.h"

namespace Generators {
namespace cuda {

Expand All @@ -19,13 +21,6 @@ struct BeamHypotheses {

// Return true if this beats the worst score in the hypothesis
__device__ bool CanImprove(float best_sum_logprobs, int current_length) const;

// Output results
__device__ void Output(int top_k, // number of sequences to return
int max_length, // max sequence length
int pad_token_id, // pad token
int32_t* sequences, // buffer with pad token, shape (num_return_sequences, max_length)
float* sequences_scores); // buffer for sequence scores, with shape (num_return_sequences)
};

struct BeamScorerState {
Expand Down Expand Up @@ -71,10 +66,17 @@ void LaunchBeamSearchScorer_Finalize(int batch_size,
int sequence_length,
std::span<BeamHypotheses> beam_hyps_,
std::span<const float> final_beam_scores,
std::span<int32_t> output,
std::span<float> sequence_scores,
cudaStream_t stream);

// Since we need to index through a couple layers of GPU memory, we need to provide a way to get the pointers
void LaunchBeamSearchScorer_GetHypothesisPtr(size_t batch_id,
size_t beam_id,
gpu_span<BeamHypotheses> beam_hyps,
int32_t** hypothesis_ptr,
int* hypothesis_length,
float* hypothesis_score,
cudaStream_t stream);

void LaunchInitScoresKernel(float* beam_scores,
int batch_size,
int num_beams,
Expand Down
5 changes: 2 additions & 3 deletions src/beam_search_scorer_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ struct BeamSearchScorer_Cuda {
std::span<const int32_t> next_indices);

void Finalize(Sequences_Cuda& sequences,
size_t num_return_sequences,
std::span<int32_t> output_sequences,
std::span<float> output_sequence_scores);
size_t num_return_sequences);

bool IsDone() const { return false; } // For CUDA we speculatively run the next step while we wait for the GPU to report status. We use 'IsDoneLater()' for this
bool IsDoneLater() const;
Expand All @@ -24,6 +22,7 @@ struct BeamSearchScorer_Cuda {
return next_beam_indices_cpu_;
}
gpu_span<int32_t> GetNextIndicesGPU() { return next_beam_indices_; }
RoamingArray<int32_t> GetBeamHypothesis(size_t batch_id, size_t beam_id) const;

private:
mutable cuda_event_holder event_process_complete_;
Expand Down
3 changes: 2 additions & 1 deletion src/cuda_sampling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ __global__ void SoftmaxBlockForward(outscalar_t* output, scalar_t* input, int cl

template <bool is_log_softmax>
void DispatchBlockwiseSoftmaxForward(cudaStream_t* stream, float* output, const float* input, int softmax_elements,
int input_stride, int output_stride, int batch_count, float temperature=1.0) {
int input_stride, int output_stride, int batch_count, float temperature) {
dim3 grid(batch_count);
constexpr int ILP = sizeof(float4) / sizeof(float);
dim3 block = SoftmaxGetBlockSize(ILP, softmax_elements);
Expand All @@ -313,6 +313,7 @@ void DispatchBlockwiseSoftmaxForward(cudaStream_t* stream, float* output, const
softmax_elements, input_stride, output_stride, temperature);
}
}
template void DispatchBlockwiseSoftmaxForward<true>(cudaStream_t*, float*, const float*, int, int, int, int, float);

// Populate Kernels and Launchers

Expand Down
Loading

0 comments on commit e41fb2c

Please sign in to comment.