Impala
Impala is the open source, native analytic database for Apache Hadoop.
sorter.cc
1 // Copyright 2013 Cloudera Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "runtime/sorter.h"
16 #include <gutil/strings/substitute.h>
17 
18 #include "runtime/buffered-block-mgr.h"
19 #include "runtime/row-batch.h"
20 #include "runtime/runtime-state.h"
21 #include "runtime/sorted-run-merger.h"
22 #include "util/runtime-profile.h"
23 
24 #include "common/names.h"
25 
26 using namespace strings;
27 
28 namespace impala {
29 
30 // Number of pinned blocks required for a merge.
31 const int BLOCKS_REQUIRED_FOR_MERGE = 3;
32 
33 // Error message when pinning fixed or variable length blocks failed.
34 // TODO: Add the node id that initiated the sort
35 const string PIN_FAILED_ERROR_MSG = "Failed to pin block for $0-length data needed "
36  "for sorting. Reducing query concurrency or increasing the memory available to "
37  "Impala may help running this query.";
38 
39 // A run is a sequence of blocks containing tuples that are or will eventually be in
40 // sorted order.
41 // A run may maintain two sequences of blocks - one containing the tuples themselves,
42 // (i.e. fixed-len slots and ptrs to var-len data), and the other for the var-length
43 // column data pointed to by those tuples.
44 // Tuples in a run may be sorted in place (in-memory) and merged using a merger.
45 class Sorter::Run {
46  public:
47  // materialize_slots is true for runs constructed from input rows. The input rows are
48  // materialized into single sort tuples using the expressions in
49  // sort_tuple_slot_expr_ctxs_. For intermediate merges, the tuples are already
50  // materialized so materialize_slots is false.
51  Run(Sorter* parent, TupleDescriptor* sort_tuple_desc, bool materialize_slots);
52 
53  // Initialize the run for input rows by allocating the minimum number of required
54  // blocks - one block for fixed-len data added to fixed_len_blocks_, one for the
55  // initially unsorted var-len data added to var_len_blocks_, and one to copy sorted
56  // var-len data into (var_len_copy_block_).
57  Status Init();
58 
59  // Add a batch of input rows to the current run. Returns the number
60  // of rows actually added in num_processed. If the run is full (no more blocks can
61  // be allocated), num_processed may be less than the number of rows in the batch.
62  // If materialize_slots_ is true, materializes the input rows using the expressions
63  // in sorter_->sort_tuple_slot_expr_ctxs_, else just copies the input rows.
64  template <bool has_var_len_data>
65  Status AddBatch(RowBatch* batch, int start_index, int* num_processed);
66 
67  // Unpins all the blocks in a sorted run. Var-length column data is copied into new
68  // blocks in sorted order. Pointers in the original tuples are converted to offsets
69  // from the beginning of the sequence of var-len data blocks.
70  Status UnpinAllBlocks();
71 
72  // Deletes all blocks.
73  void DeleteAllBlocks();
74 
75  // Interface for merger - get the next batch of rows from this run. The callee (Run)
76  // still owns the returned batch. Calls GetNext(RowBatch*, bool*).
77  Status GetNextBatch(RowBatch** sorted_batch);
78 
79  private:
80  friend class Sorter;
81  friend class TupleSorter;
82 
83  // Fill output_batch with rows from this run. If convert_offset_to_ptr is true, offsets
84  // in var-length slots are converted back to pointers. Only row pointers are copied
85  // into output_batch.
86  // If this run was unpinned, one block (2 if there are var-len slots) is pinned while
87  // rows are filled into output_batch. The block is unpinned before the next block is
88  // pinned. At most 1 (2) block(s) will be pinned at any time.
89  // If the run was pinned, the blocks are not unpinned (Sorter holds on to the memory).
90  // In either case, all rows in output_batch will have their fixed and var-len data from
91  // the same block.
92  // TODO: If we leave the last run to be merged in memory, the fixed-len blocks can be
93  // unpinned as they are consumed.
94  template <bool convert_offset_to_ptr>
95  Status GetNext(RowBatch* output_batch, bool* eos);
96 
97  // Check if a run can be extended by allocating additional blocks from the block
98  // manager. Always true when building a sorted run in an intermediate merge, because
99  // the current block(s) can be unpinned before getting the next free block (so a block
100  // is always available).
101  bool CanExtendRun() const;
102 
103  // Collect the non-null var-len (e.g. STRING) slots from 'src' in 'var_len_values' and return
104  // the total length of all var_len slots in total_var_len.
105  void CollectNonNullVarSlots(Tuple* src, vector<StringValue*>* var_len_values,
106  int* total_var_len);
107 
108  // Check if the current run can be extended by a block. Add the newly allocated block
109  // to block_sequence, or set added to false if the run could not be extended.
110  // If the run is sorted (produced by an intermediate merge), unpin the last block in
111  // block_sequence before allocating and adding a new block - the run can always be
112  // extended in this case. If the run is unsorted, check max_blocks_in_unsorted_run_
113  // to see if a block can be added to the run. Also updates the sort bytes counter.
114  Status TryAddBlock(vector<BufferedBlockMgr::Block*>* block_sequence, bool* added);
115 
116  // Prepare to read a sorted run. Pins the first block(s) in the run if the run was
117  // previously unpinned.
118  Status PrepareRead();
119 
120  // Copy the StringValue data in var_values to dest in order and update the StringValue
121  // ptrs to point to the copied data.
122  void CopyVarLenData(char* dest, const vector<StringValue*>& var_values);
123 
124  // Copy the StringValue in var_values to dest in order. Update the StringValue ptrs to
125  // contain an offset to the copied data. Parameter 'offset' is the offset for the first
126  // StringValue.
127  void CopyVarLenDataConvertOffset(char* dest, int64_t offset,
128  const vector<StringValue*>& var_values);
129 
130  // Parent sorter object.
131  const Sorter* sorter_;
132 
133  // Materialized sort tuple. Input rows are materialized into 1 tuple (with descriptor
134  // sort_tuple_desc_) before sorting.
135  const TupleDescriptor* sort_tuple_desc_;
136 
137  // Sizes of sort tuple and block.
138  const int sort_tuple_size_;
139  const int block_size_;
140 
141  const bool has_var_len_slots_;
142 
143  // True if the sort tuple must be materialized from the input batch in AddBatch().
144  // materialize_slots_ is true for runs being constructed from input batches, and
145  // is false for runs being constructed from intermediate merges.
146  const bool materialize_slots_;
147 
148  // True if the run is sorted. Set to true after an in-memory sort, and initialized to
149  // true for runs resulting from merges.
150  bool is_sorted_;
151 
152  // True if all blocks in the run are pinned.
153  bool is_pinned_;
154 
155  // Sequence of blocks in this run containing the fixed-length portion of the sort tuples
156  // comprising this run. The data pointed to by the var-len slots are in var_len_blocks_.
157  // If is_sorted_ is true, the tuples in fixed_len_blocks_ will be in sorted order.
158  // fixed_len_blocks_[i] is NULL iff it has been deleted.
159  vector<BufferedBlockMgr::Block*> fixed_len_blocks_;
160 
161  // Sequence of blocks in this run containing the var-length data corresponding to the
162  // var-length columns from fixed_len_blocks_. These are reconstructed to be in sorted
163  // order in UnpinAllBlocks().
164  // var_len_blocks_[i] is NULL iff it has been deleted.
165  vector<BufferedBlockMgr::Block*> var_len_blocks_;
166 
167  // If there are var-len slots, an extra pinned block is used to copy out var-len data
168  // into a new sequence of blocks in sorted order. var_len_copy_block_ stores this
169  // extra allocated block.
170  BufferedBlockMgr::Block* var_len_copy_block_;
171 
172  // Number of tuples so far in this run.
173  int64_t num_tuples_;
174 
175  // Number of tuples returned via GetNext(), maintained for debug purposes.
176  int64_t num_tuples_returned_;
177 
178  // buffered_batch_ is used to return TupleRows to the merger when this run is being
179  // merged. buffered_batch_ is returned in calls to GetNextBatch().
180  scoped_ptr<RowBatch> buffered_batch_;
181 
182  // Members used when a run is read in GetNext()
183  // The index into the fixed_ and var_len_blocks_ vectors of the current blocks being
184  // processed in GetNext().
185  int fixed_len_blocks_index_;
186  int var_len_blocks_index_;
187 
188  // If true, pin the next fixed and var-len blocks and delete the previous ones
189  // during the next call to GetNext(). Set during the previous call to GetNext().
190  // Not used if a run is already pinned.
191  bool pin_next_fixed_len_block_;
192  bool pin_next_var_len_block_;
193 
194  // Offset into the current fixed length data block being processed.
195  int fixed_len_block_offset_;
196 }; // class Sorter::Run
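// A sketch of the lifecycle the Sorter drives a Run through, inferred from the
// declarations above ('input_batch' is a hypothetical RowBatch; error handling
// and the in-memory sort step are elided):
//
//   Run* run = obj_pool_.Add(new Run(this, sort_tuple_desc, true));
//   RETURN_IF_ERROR(run->Init());
//   int num_processed = 0;
//   RETURN_IF_ERROR(run->AddBatch<true>(input_batch, 0, &num_processed));
//   // ... sort in memory, then UnpinAllBlocks() if a merge is needed ...
//   RETURN_IF_ERROR(run->PrepareRead());
//   RowBatch* batch;
//   RETURN_IF_ERROR(run->GetNextBatch(&batch));  // batch == NULL indicates eos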
197 
198 // Sorts a sequence of tuples from a run in place using a provided tuple comparator.
199 // Quick sort is used for sequences of tuples larger than 16 elements, and insertion sort
200 // is used for smaller sequences. The TupleSorter is initialized with a RuntimeState
201 // instance to check for cancellation during an in-memory sort.
202 class Sorter::TupleSorter {
203  public:
204  TupleSorter(const TupleRowComparator& less_than_comp, int64_t block_size,
205  int tuple_size, RuntimeState* state);
206 
207  ~TupleSorter();
208 
209  // Performs a quicksort for tuples in 'run' followed by an insertion sort to
210  // finish smaller blocks.
211  // Returns early if state_->is_cancelled() is true. No status
212  // is returned - the caller must check for cancellation.
213  void Sort(Run* run);
214 
215  private:
216  static const int INSERTION_THRESHOLD = 16;
217 
218  // Helper class used to iterate over tuples in a run during quick sort and insertion
219  // sort.
220  class TupleIterator {
221  public:
222  TupleIterator(TupleSorter* parent, int64_t index)
223  : parent_(parent),
224  index_(index),
225  current_tuple_(NULL) {
226  DCHECK_GE(index, 0);
227  DCHECK_LE(index, parent_->run_->num_tuples_);
228  // If the run is empty, only index_ and current_tuple_ are initialized.
229  if (parent_->run_->num_tuples_ == 0) return;
230  // If the iterator is initialized to past the end, set up buffer_start_ and
231  // block_index_ as if it were pointing to the last tuple. Add tuple_size_ bytes to
232  // current_tuple_, so everything is correct when Prev() is invoked.
233  int past_end_bytes = 0;
234  if (UNLIKELY(index >= parent_->run_->num_tuples_)) {
235  past_end_bytes = parent->tuple_size_;
236  index_ = parent_->run_->num_tuples_;
237  index = index_ - 1;
238  }
239  block_index_ = index / parent->block_capacity_;
240  buffer_start_ = parent->run_->fixed_len_blocks_[block_index_]->buffer();
241  int block_offset = (index % parent->block_capacity_) * parent->tuple_size_;
242  current_tuple_ = buffer_start_ + block_offset + past_end_bytes;
243  }
244 
245  // Sets current_tuple_ to point to the next tuple in the run. Increments
246  // block_index and resets buffer if the next tuple is in the next block.
247  void Next() {
248  current_tuple_ += parent_->tuple_size_;
249  ++index_;
250  if (UNLIKELY(current_tuple_ > buffer_start_ + parent_->last_tuple_block_offset_ &&
251  index_ < parent_->run_->num_tuples_)) {
252  // Don't increment block index, etc. past the end.
253  ++block_index_;
254  DCHECK_LT(block_index_, parent_->run_->fixed_len_blocks_.size());
255  buffer_start_ = parent_->run_->fixed_len_blocks_[block_index_]->buffer();
256  current_tuple_ = buffer_start_;
257  }
258  }
259 
260  // Sets current_tuple to point to the previous tuple in the run. Decrements
261  // block_index and resets buffer if the new tuple is in the previous block.
262  void Prev() {
263  current_tuple_ -= parent_->tuple_size_;
264  --index_;
265  if (UNLIKELY(current_tuple_ < buffer_start_ && index_ >= 0)) {
266  --block_index_;
267  DCHECK_GE(block_index_, 0);
268  buffer_start_ = parent_->run_->fixed_len_blocks_[block_index_]->buffer();
269  current_tuple_ = buffer_start_ + parent_->last_tuple_block_offset_;
270  }
271  }
272 
273  private:
274  friend class TupleSorter;
275 
276  // Pointer to the tuple sorter.
277  TupleSorter* parent_;
278 
279  // Index of the current tuple in the run.
280  int64_t index_;
281 
282  // Pointer to the current tuple.
283  uint8_t* current_tuple_;
284 
285  // Start of the buffer containing current tuple.
286  uint8_t* buffer_start_;
287 
288  // Index into run_.fixed_len_blocks_ of the block containing the current tuple.
289  int block_index_;
290  };
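// Worked example of the iterator's position arithmetic above, with
// hypothetical sizes tuple_size_ = 16 and block_capacity_ = 1000: index 2500
// gives block_index_ = 2500 / 1000 = 2 and a block offset of
// (2500 % 1000) * 16 = 8000, so current_tuple_ = buffer_start_ + 8000 in the
// third fixed-len block.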
291 
292  // Size of the tuples in memory.
293  const int tuple_size_;
294 
295  // Number of tuples per block in a run.
296  const int block_capacity_;
297 
298  // Offset in bytes of the last tuple in a block, calculated from block and tuple sizes.
299  const int last_tuple_block_offset_;
300 
301  // Tuple comparator that returns true if lhs < rhs.
302  const TupleRowComparator less_than_comp_;
303 
304  // Runtime state instance to check for cancellation. Not owned.
305  RuntimeState* const state_;
306 
307  // The run to be sorted.
308  Run* run_;
309 
310  // Temporarily allocated space to copy and swap tuples (Both are used in Partition()).
311  // temp_tuple_ points to temp_tuple_buffer_. Owned by this TupleSorter instance.
312  TupleRow* temp_tuple_row_;
313  uint8_t* temp_tuple_buffer_;
314  uint8_t* swap_buffer_;
315 
316  // Perform an insertion sort for rows in the range [first, last) in a run.
317  void InsertionSort(const TupleIterator& first, const TupleIterator& last);
318 
319  // Partitions the sequence of tuples in the range [first, last) in a run into two groups
320  // around the pivot tuple - i.e. tuples in the first group are <= the pivot, and tuples in
321  // the second group are >= pivot. Tuples are swapped in place to create the groups and
322  // the index to the first element in the second group is returned.
323  // Checks state_->is_cancelled() and returns early with an invalid result if true.
324  TupleIterator Partition(TupleIterator first, TupleIterator last, Tuple* pivot);
325 
326  // Performs a quicksort of rows in the range [first, last) followed by insertion sort
327  // for smaller groups of elements.
328  // Checks state_->is_cancelled() and returns early if true.
329  void SortHelper(TupleIterator first, TupleIterator last);
330 
331  // Swaps tuples pointed to by left and right using the swap buffer.
332  void Swap(uint8_t* left, uint8_t* right);
333 }; // class TupleSorter
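// A sketch of how a TupleSorter is constructed and used, following the
// declarations above (argument values are illustrative):
//
//   TupleSorter tuple_sorter(compare_less_than_, block_mgr_->max_block_size(),
//       sort_tuple_desc->byte_size(), state_);
//   tuple_sorter.Sort(run);  // sorts in place; returns early on cancellation,
//                            // so the caller must check state_->is_cancelled()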
334 
335 // Sorter::Run methods
336 Sorter::Run::Run(Sorter* parent, TupleDescriptor* sort_tuple_desc,
337  bool materialize_slots)
338  : sorter_(parent),
339  sort_tuple_desc_(sort_tuple_desc),
340  sort_tuple_size_(sort_tuple_desc->byte_size()),
341  block_size_(parent->block_mgr_->max_block_size()),
342  has_var_len_slots_(sort_tuple_desc->string_slots().size() > 0),
343  materialize_slots_(materialize_slots),
344  is_sorted_(!materialize_slots),
345  is_pinned_(true),
346  var_len_copy_block_(NULL),
347  num_tuples_(0) {
348 }
349 
350 Status Sorter::Run::Init() {
351  BufferedBlockMgr::Block* block = NULL;
352  RETURN_IF_ERROR(
353  sorter_->block_mgr_->GetNewBlock(sorter_->block_mgr_client_, NULL, &block));
354  DCHECK_NOTNULL(block);
355  fixed_len_blocks_.push_back(block);
356  if (has_var_len_slots_) {
357  RETURN_IF_ERROR(
358  sorter_->block_mgr_->GetNewBlock(sorter_->block_mgr_client_, NULL, &block));
359  DCHECK_NOTNULL(block);
360  var_len_blocks_.push_back(block);
361  if (!is_sorted_) {
362  RETURN_IF_ERROR(sorter_->block_mgr_->GetNewBlock(
363  sorter_->block_mgr_client_, NULL, &var_len_copy_block_));
364  DCHECK_NOTNULL(var_len_copy_block_);
365  }
366  }
367  if (!is_sorted_) sorter_->initial_runs_counter_->Add(1);
368  return Status::OK;
369 }
370 
371 template <bool has_var_len_data>
372 Status Sorter::Run::AddBatch(RowBatch* batch, int start_index, int* num_processed) {
373  DCHECK(!fixed_len_blocks_.empty());
374  *num_processed = 0;
375  BufferedBlockMgr::Block* cur_fixed_len_block = fixed_len_blocks_.back();
376 
377  DCHECK_EQ(materialize_slots_, !is_sorted_);
378  if (!materialize_slots_) {
379  // If materialize slots is false the run is being constructed for an
380  // intermediate merge and the sort tuples have already been materialized.
381  // The input row should have the same schema as the sort tuples.
382  DCHECK_EQ(batch->row_desc().tuple_descriptors().size(), 1);
383  DCHECK_EQ(batch->row_desc().tuple_descriptors()[0], sort_tuple_desc_);
384  }
385 
386  // Input rows are copied/materialized into tuples allocated in fixed_len_blocks_.
387  // The variable length column data are copied into blocks stored in var_len_blocks_.
388  // Input row processing is split into two loops.
389  // The inner loop processes as many input rows as will fit in cur_fixed_len_block.
390  // The outer loop allocates a new block for fixed-len data if the input batch is
391  // not exhausted.
392 
393  // cur_input_index is the index into the input 'batch' of the current input row being
394  // processed.
395  int cur_input_index = start_index;
396  vector<StringValue*> var_values;
397  var_values.reserve(sort_tuple_desc_->string_slots().size());
398  while (cur_input_index < batch->num_rows()) {
399  // tuples_remaining is the number of tuples to copy/materialize into
400  // cur_fixed_len_block.
401  int tuples_remaining = cur_fixed_len_block->BytesRemaining() / sort_tuple_size_;
402  tuples_remaining = min(batch->num_rows() - cur_input_index, tuples_remaining);
403 
404  for (int i = 0; i < tuples_remaining; ++i) {
405  int total_var_len = 0;
406  TupleRow* input_row = batch->GetRow(cur_input_index);
407  Tuple* new_tuple = cur_fixed_len_block->Allocate<Tuple>(sort_tuple_size_);
408  if (materialize_slots_) {
409  new_tuple->MaterializeExprs<has_var_len_data>(input_row, *sort_tuple_desc_,
410  sorter_->sort_tuple_slot_expr_ctxs_, NULL, &var_values, &total_var_len);
411  if (total_var_len > sorter_->block_mgr_->max_block_size()) {
412  return Status(ErrorMsg(TErrorCode::INTERNAL_ERROR, Substitute(
413  "Variable length data in a single tuple larger than block size $0 > $1",
414  total_var_len, sorter_->block_mgr_->max_block_size())));
415  }
416  } else {
417  memcpy(new_tuple, input_row->GetTuple(0), sort_tuple_size_);
418  if (has_var_len_data) {
419  CollectNonNullVarSlots(new_tuple, &var_values, &total_var_len);
420  }
421  }
422 
423  if (has_var_len_data) {
424  DCHECK_GT(var_len_blocks_.size(), 0);
425  BufferedBlockMgr::Block* cur_var_len_block = var_len_blocks_.back();
426  if (cur_var_len_block->BytesRemaining() < total_var_len) {
427  bool added;
428  RETURN_IF_ERROR(TryAddBlock(&var_len_blocks_, &added));
429  if (added) {
430  cur_var_len_block = var_len_blocks_.back();
431  } else {
432  // There was not enough space in the last var-len block for this tuple, and
433  // the run could not be extended. Return the fixed-len allocation and exit.
434  cur_fixed_len_block->ReturnAllocation(sort_tuple_size_);
435  return Status::OK;
436  }
437  }
438 
439  char* var_data_ptr = cur_var_len_block->Allocate<char>(total_var_len);
440  if (materialize_slots_) {
441  CopyVarLenData(var_data_ptr, var_values);
442  } else {
443  int64_t offset = (var_len_blocks_.size() - 1) * block_size_;
444  offset += var_data_ptr - reinterpret_cast<char*>(cur_var_len_block->buffer());
445  CopyVarLenDataConvertOffset(var_data_ptr, offset, var_values);
446  }
447  }
448  ++num_tuples_;
449  ++*num_processed;
450  ++cur_input_index;
451  }
452 
453  // If there are still rows left to process, get a new block for the fixed-length
454  // tuples. If the run is already too long, return.
455  if (cur_input_index < batch->num_rows()) {
456  bool added;
457  RETURN_IF_ERROR(TryAddBlock(&fixed_len_blocks_, &added));
458  if (added) {
459  cur_fixed_len_block = fixed_len_blocks_.back();
460  } else {
461  return Status::OK;
462  }
463  }
464  }
465  return Status::OK;
466 }
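// Worked example of the offset encoding above, assuming a hypothetical 8MB
// block_size_: var-len data copied 1MB into the third var-len block
// (var_len_blocks_.size() == 3) is recorded as offset
// (3 - 1) * 8MB + 1MB = 17MB from the start of the block sequence.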
467 
468 void Sorter::Run::DeleteAllBlocks() {
469  BOOST_FOREACH(BufferedBlockMgr::Block* block, fixed_len_blocks_) {
470  if (block != NULL) block->Delete();
471  }
472  BOOST_FOREACH(BufferedBlockMgr::Block* block, var_len_blocks_) {
473  if (block != NULL) block->Delete();
474  }
475  if (var_len_copy_block_ != NULL) var_len_copy_block_->Delete();
476 }
477 
478 Status Sorter::Run::UnpinAllBlocks() {
479  vector<BufferedBlockMgr::Block*> sorted_var_len_blocks;
480  sorted_var_len_blocks.reserve(var_len_blocks_.size());
481  vector<StringValue*> var_values;
482  int64_t var_data_offset = 0;
483  int total_var_len;
484  var_values.reserve(sort_tuple_desc_->string_slots().size());
485  BufferedBlockMgr::Block* cur_sorted_var_len_block = NULL;
486  if (has_var_len_slots_ && var_len_blocks_.size() > 0) {
487  DCHECK_NOTNULL(var_len_copy_block_);
488  sorted_var_len_blocks.push_back(var_len_copy_block_);
489  cur_sorted_var_len_block = sorted_var_len_blocks.back();
490  } else {
491  DCHECK(var_len_copy_block_ == NULL);
492  }
493 
494  for (int i = 0; i < fixed_len_blocks_.size(); ++i) {
495  BufferedBlockMgr::Block* cur_fixed_block = fixed_len_blocks_[i];
496  if (has_var_len_slots_) {
497  for (int block_offset = 0; block_offset < cur_fixed_block->valid_data_len();
498  block_offset += sort_tuple_size_) {
499  Tuple* cur_tuple =
500  reinterpret_cast<Tuple*>(cur_fixed_block->buffer() + block_offset);
501  CollectNonNullVarSlots(cur_tuple, &var_values, &total_var_len);
502  if (cur_sorted_var_len_block->BytesRemaining() < total_var_len) {
503  bool added;
504  RETURN_IF_ERROR(TryAddBlock(&sorted_var_len_blocks, &added));
505  DCHECK(added);
506  cur_sorted_var_len_block = sorted_var_len_blocks.back();
507  }
508  char* var_data_ptr = cur_sorted_var_len_block->Allocate<char>(total_var_len);
509  var_data_offset = block_size_ * (sorted_var_len_blocks.size() - 1) +
510  (var_data_ptr - reinterpret_cast<char*>(cur_sorted_var_len_block->buffer()));
511  CopyVarLenDataConvertOffset(var_data_ptr, var_data_offset, var_values);
512  }
513  }
514  RETURN_IF_ERROR(cur_fixed_block->Unpin());
515  }
516 
517  if (has_var_len_slots_ && var_len_blocks_.size() > 0) {
518  DCHECK_GT(sorted_var_len_blocks.back()->valid_data_len(), 0);
519  RETURN_IF_ERROR(sorted_var_len_blocks.back()->Unpin());
520  }
521 
522  // Clear var_len_blocks_ and replace it with the contents of sorted_var_len_blocks.
523  BOOST_FOREACH(BufferedBlockMgr::Block* var_block, var_len_blocks_) {
524  RETURN_IF_ERROR(var_block->Delete());
525  }
526  var_len_blocks_.clear();
527  sorted_var_len_blocks.swap(var_len_blocks_);
528  // Set var_len_copy_block_ to NULL since it's now in var_len_blocks_ and is no longer
529  // needed.
530  var_len_copy_block_ = NULL;
531  is_pinned_ = false;
532  return Status::OK;
533 }
534 
535 Status Sorter::Run::PrepareRead() {
536  fixed_len_blocks_index_ = 0;
537  fixed_len_block_offset_ = 0;
538  var_len_blocks_index_ = 0;
539  pin_next_fixed_len_block_ = pin_next_var_len_block_ = false;
540  num_tuples_returned_ = 0;
541 
542  buffered_batch_.reset(new RowBatch(*sorter_->output_row_desc_,
543  sorter_->state_->batch_size(), sorter_->mem_tracker_));
544 
545  // If the run is pinned, merge is not invoked, so buffered_batch_ is not needed
546  // and the individual blocks do not need to be pinned.
547  if (is_pinned_) return Status::OK;
548 
549  // Attempt to pin the first fixed and var-length blocks. In either case, pinning may
550  // fail if the number of reserved blocks is oversubscribed, see IMPALA-1590.
551  if (fixed_len_blocks_.size() > 0) {
552  bool pinned = false;
553  RETURN_IF_ERROR(fixed_len_blocks_[0]->Pin(&pinned));
554  if (!pinned) {
555  Status status = Status::MEM_LIMIT_EXCEEDED;
556  status.AddDetail(Substitute(PIN_FAILED_ERROR_MSG, "fixed"));
557  return status;
558  }
559  }
560 
561  if (has_var_len_slots_ && var_len_blocks_.size() > 0) {
562  bool pinned = false;
563  RETURN_IF_ERROR(var_len_blocks_[0]->Pin(&pinned));
564  if (!pinned) {
565  Status status = Status::MEM_LIMIT_EXCEEDED;
566  status.AddDetail(Substitute(PIN_FAILED_ERROR_MSG, "variable"));
567  return status;
568  }
569  }
570  return Status::OK;
571 }
572 
573 Status Sorter::Run::GetNextBatch(RowBatch** output_batch) {
574  if (buffered_batch_.get() != NULL) {
575  buffered_batch_->Reset();
576  // Fill more rows into buffered_batch_.
577  bool eos;
578  if (has_var_len_slots_ && !is_pinned_) {
579  RETURN_IF_ERROR(GetNext<true>(buffered_batch_.get(), &eos));
580  if (buffered_batch_->num_rows() == 0 && !eos) {
581  // No rows were filled because GetNext() had to read the next var-len block.
582  // Call GetNext() again.
583  RETURN_IF_ERROR(GetNext<true>(buffered_batch_.get(), &eos));
584  }
585  } else {
586  RETURN_IF_ERROR(GetNext<false>(buffered_batch_.get(), &eos));
587  }
588  DCHECK(eos || buffered_batch_->num_rows() > 0);
589  if (eos) {
590  // No rows are filled in GetNext() on eos, so this is safe.
591  DCHECK_EQ(buffered_batch_->num_rows(), 0);
592  buffered_batch_.reset();
593  // The merge is complete. Delete the last blocks in the run.
594  RETURN_IF_ERROR(fixed_len_blocks_.back()->Delete());
595  fixed_len_blocks_[fixed_len_blocks_.size() - 1] = NULL;
596  if (has_var_len_slots_) {
597  RETURN_IF_ERROR(var_len_blocks_.back()->Delete());
598  var_len_blocks_[var_len_blocks_.size() - 1] = NULL;
599  }
600  }
601  }
602 
603  // *output_batch == NULL indicates eos.
604  *output_batch = buffered_batch_.get();
605  return Status::OK;
606 }
607 
608 template <bool convert_offset_to_ptr>
609 Status Sorter::Run::GetNext(RowBatch* output_batch, bool* eos) {
610  if (fixed_len_blocks_index_ == fixed_len_blocks_.size()) {
611  *eos = true;
612  DCHECK_EQ(num_tuples_returned_, num_tuples_);
613  return Status::OK;
614  } else {
615  *eos = false;
616  }
617 
618  BufferedBlockMgr::Block* fixed_len_block = fixed_len_blocks_[fixed_len_blocks_index_];
619 
620  if (!is_pinned_) {
621  // Pin the next block and delete the previous if set in the previous call to
622  // GetNext().
623  if (pin_next_fixed_len_block_) {
624  RETURN_IF_ERROR(fixed_len_blocks_[fixed_len_blocks_index_ - 1]->Delete());
625  fixed_len_blocks_[fixed_len_blocks_index_ - 1] = NULL;
626  bool pinned;
627  RETURN_IF_ERROR(fixed_len_block->Pin(&pinned));
628  DCHECK(pinned);
629  pin_next_fixed_len_block_ = false;
630  }
631  if (pin_next_var_len_block_) {
632  RETURN_IF_ERROR(var_len_blocks_[var_len_blocks_index_ - 1]->Delete());
633  var_len_blocks_[var_len_blocks_index_ - 1] = NULL;
634  bool pinned;
635  RETURN_IF_ERROR(var_len_blocks_[var_len_blocks_index_]->Pin(&pinned));
636  DCHECK(pinned);
637  pin_next_var_len_block_ = false;
638  }
639  }
640 
641  // GetNext fills rows into the output batch until a block boundary is reached.
642  while (!output_batch->AtCapacity() &&
643  fixed_len_block_offset_ < fixed_len_block->valid_data_len()) {
644  Tuple* input_tuple = reinterpret_cast<Tuple*>(
645  fixed_len_block->buffer() + fixed_len_block_offset_);
646 
647  if (convert_offset_to_ptr) {
648  // Convert the offsets in the var-len slots in input_tuple back to pointers.
649  const vector<SlotDescriptor*>& var_slots = sort_tuple_desc_->string_slots();
650  for (int i = 0; i < var_slots.size(); ++i) {
651  SlotDescriptor* slot_desc = var_slots[i];
652  if (input_tuple->IsNull(slot_desc->null_indicator_offset())) continue;
653 
654  DCHECK_EQ(slot_desc->type().type, TYPE_STRING);
655  StringValue* value = reinterpret_cast<StringValue*>(
656  input_tuple->GetSlot(slot_desc->tuple_offset()));
657  int64_t data_offset = reinterpret_cast<int64_t>(value->ptr);
658 
659  // data_offset is an offset in bytes from the beginning of the first block
660  // in var_len_blocks_. Convert it into an index into var_len_blocks_ and an
661  // offset within that block.
662  int block_index = data_offset / block_size_;
663  int block_offset = data_offset % block_size_;
664 
665  if (block_index > var_len_blocks_index_) {
666  // We've reached the block boundary for the current var-len block.
667  // This tuple will be returned in the next call to GetNext().
668  DCHECK_EQ(block_index, var_len_blocks_index_ + 1);
669  DCHECK_EQ(block_offset, 0);
670  DCHECK_EQ(i, 0);
671  var_len_blocks_index_ = block_index;
672  pin_next_var_len_block_ = true;
673  break;
674  } else {
675  DCHECK_EQ(block_index, var_len_blocks_index_);
676  // Calculate the address implied by the offset and assign it.
677  value->ptr = reinterpret_cast<char*>(
678  var_len_blocks_[var_len_blocks_index_]->buffer() + block_offset);
679  } // if (block_index > var_len_blocks_index_)
680  } // for (int i = 0; i < var_slots.size(); ++i)
681 
682  // The var-len data is in the next block, so end this call to GetNext().
683  if (pin_next_var_len_block_) break;
684  } // if (convert_offset_to_ptr)
685 
686  int output_row_idx = output_batch->AddRow();
687  output_batch->GetRow(output_row_idx)->SetTuple(0, input_tuple);
688  output_batch->CommitLastRow();
689  fixed_len_block_offset_ += sort_tuple_size_;
690  ++num_tuples_returned_;
691  }
692 
693  if (fixed_len_block_offset_ >= fixed_len_block->valid_data_len()) {
694  pin_next_fixed_len_block_ = true;
695  ++fixed_len_blocks_index_;
696  fixed_len_block_offset_ = 0;
697  }
698 
699  return Status::OK;
700 }
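// Worked example of the offset decoding above (same hypothetical 8MB
// block_size_ as in AddBatch()): a stored offset of 17MB yields
// block_index = 17MB / 8MB = 2 and block_offset = 17MB % 8MB = 1MB,
// i.e. the data starts 1MB into the third var-len block.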
701 
702 void Sorter::Run::CollectNonNullVarSlots(Tuple* src,
703  vector<StringValue*>* var_len_values, int* total_var_len) {
704  var_len_values->clear();
705  *total_var_len = 0;
706  BOOST_FOREACH(const SlotDescriptor* var_slot, sort_tuple_desc_->string_slots()) {
707  if (!src->IsNull(var_slot->null_indicator_offset())) {
708  StringValue* string_val =
709  reinterpret_cast<StringValue*>(src->GetSlot(var_slot->tuple_offset()));
710  var_len_values->push_back(string_val);
711  *total_var_len += string_val->len;
712  }
713  }
714 }
715 
716 Status Sorter::Run::TryAddBlock(vector<BufferedBlockMgr::Block*>* block_sequence,
717  bool* added) {
718  DCHECK(!block_sequence->empty());
719  BufferedBlockMgr::Block* last_block = block_sequence->back();
720  if (!is_sorted_) {
721  sorter_->sorted_data_size_->Add(last_block->valid_data_len());
722  last_block = NULL;
723  } else {
724  // If the run is sorted, we will unpin the last block and extend the run.
725  }
726 
727  BufferedBlockMgr::Block* new_block;
728  RETURN_IF_ERROR(sorter_->block_mgr_->GetNewBlock(
729  sorter_->block_mgr_client_, last_block, &new_block));
730  if (new_block != NULL) {
731  *added = true;
732  block_sequence->push_back(new_block);
733  } else {
734  *added = false;
735  }
736  return Status::OK;
737 }
738 
739 void Sorter::Run::CopyVarLenData(char* dest, const vector<StringValue*>& var_values) {
740  BOOST_FOREACH(StringValue* var_val, var_values) {
741  memcpy(dest, var_val->ptr, var_val->len);
742  var_val->ptr = dest;
743  dest += var_val->len;
744  }
745 }
746 
747 void Sorter::Run::CopyVarLenDataConvertOffset(char* dest, int64_t offset,
748  const vector<StringValue*>& var_values) {
749  BOOST_FOREACH(StringValue* var_val, var_values) {
750  memcpy(dest, var_val->ptr, var_val->len);
751  var_val->ptr = reinterpret_cast<char*>(offset);
752  dest += var_val->len;
753  offset += var_val->len;
754  }
755 }
756 
757 // Sorter::TupleSorter methods.
758 Sorter::TupleSorter::TupleSorter(const TupleRowComparator& comp, int64_t block_size,
759  int tuple_size, RuntimeState* state)
760  : tuple_size_(tuple_size),
761  block_capacity_(block_size / tuple_size),
762  last_tuple_block_offset_(tuple_size * ((block_size / tuple_size) - 1)),
763  less_than_comp_(comp),
764  state_(state) {
765  temp_tuple_buffer_ = new uint8_t[tuple_size];
766  temp_tuple_row_ = reinterpret_cast<TupleRow*>(&temp_tuple_buffer_);
767  swap_buffer_ = new uint8_t[tuple_size];
768 }
769 
770 Sorter::TupleSorter::~TupleSorter() {
771  delete[] temp_tuple_buffer_;
772  delete[] swap_buffer_;
773 }
774 
775 void Sorter::TupleSorter::Sort(Run* run) {
776  run_ = run;
777  SortHelper(TupleIterator(this, 0), TupleIterator(this, run_->num_tuples_));
778  run->is_sorted_ = true;
779 }
780 
781 // Sort the sequence of tuples from [first, last).
782 // Begin with a sorted sequence of size 1 [first, first+1).
783 // During each pass of the outermost loop, add the next tuple (at position 'i') to
784 // the sorted sequence by comparing it to each element of the sorted sequence
785 // (reverse order) to find its correct place in the sorted sequence, copying tuples
786 // along the way.
787 void Sorter::TupleSorter::InsertionSort(const TupleIterator& first,
788  const TupleIterator& last) {
789  TupleIterator insert_iter = first;
790  insert_iter.Next();
791  for (; insert_iter.index_ < last.index_; insert_iter.Next()) {
792  // insert_iter points to the tuple after the currently sorted sequence that must
793  // be inserted into the sorted sequence. Copy to temp_tuple_row_ since it may be
794  // overwritten by the one at position 'insert_iter - 1'
795  memcpy(temp_tuple_buffer_, insert_iter.current_tuple_, tuple_size_);
796 
797  // 'iter' points to the tuple that temp_tuple_row_ will be compared to.
798  // 'copy_to' is where 'iter' should be copied to if it is >= temp_tuple_row_.
799  // copy_to always points to the row after 'iter'.
800  TupleIterator iter = insert_iter;
801  iter.Prev();
802  uint8_t* copy_to = insert_iter.current_tuple_;
803  while (less_than_comp_(temp_tuple_row_,
804  reinterpret_cast<TupleRow*>(&iter.current_tuple_))) {
805  memcpy(copy_to, iter.current_tuple_, tuple_size_);
806  copy_to = iter.current_tuple_;
807  // Break if 'iter' has reached the first row, meaning that temp_tuple_row_
808  // will be inserted in position 'first'
809  if (iter.index_ <= first.index_) break;
810  iter.Prev();
811  }
812 
813  memcpy(copy_to, temp_tuple_buffer_, tuple_size_);
814  }
815 }
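// Example trace (tuple keys only): sorting [3, 1, 2] first inserts 1, copying
// 3 one slot right to give [1, 3, 2]; it then inserts 2, copying 3 right again
// to give the sorted sequence [1, 2, 3].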
816 
817 Sorter::TupleSorter::TupleIterator Sorter::TupleSorter::Partition(TupleIterator first,
818  TupleIterator last, Tuple* pivot) {
819  // Copy pivot into temp_tuple since it points to a tuple within [first, last).
820  memcpy(temp_tuple_buffer_, pivot, tuple_size_);
821 
822  last.Prev();
823  while (true) {
824  // Search for the first and last out-of-place elements, and swap them.
825  while (less_than_comp_(reinterpret_cast<TupleRow*>(&first.current_tuple_),
826  temp_tuple_row_)) {
827  first.Next();
828  }
829  while (less_than_comp_(temp_tuple_row_,
830  reinterpret_cast<TupleRow*>(&last.current_tuple_))) {
831  last.Prev();
832  }
833 
834  if (first.index_ >= last.index_) break;
835  // Swap first and last tuples.
836  Swap(first.current_tuple_, last.current_tuple_);
837 
838  first.Next();
839  last.Prev();
840  }
841 
842  return first;
843 }
844 
845 void Sorter::TupleSorter::SortHelper(TupleIterator first, TupleIterator last) {
846  if (UNLIKELY(state_->is_cancelled())) return;
847  // Use insertion sort for smaller sequences.
848  while (last.index_ - first.index_ > INSERTION_THRESHOLD) {
849  TupleIterator iter(this, first.index_ + (last.index_ - first.index_)/2);
850  DCHECK_NOTNULL(iter.current_tuple_);
851  // Partition() splits the tuples in [first, last) into two groups (<= pivot
852  // and >= pivot) in-place. 'cut' is the index of the first tuple in the second group.
853  TupleIterator cut = Partition(first, last,
854  reinterpret_cast<Tuple*>(iter.current_tuple_));
855  SortHelper(cut, last);
856  last = cut;
857  if (UNLIKELY(state_->is_cancelled())) return;
858  }
859 
860  InsertionSort(first, last);
861 }
862 
863 inline void Sorter::TupleSorter::Swap(uint8_t* left, uint8_t* right) {
864  memcpy(swap_buffer_, left, tuple_size_);
865  memcpy(left, right, tuple_size_);
866  memcpy(right, swap_buffer_, tuple_size_);
867 }
868 
869 // Sorter methods
870 Sorter::Sorter(const TupleRowComparator& compare_less_than,
871  const vector<ExprContext*>& slot_materialize_expr_ctxs,
872  RowDescriptor* output_row_desc, MemTracker* mem_tracker,
873  RuntimeProfile* profile, RuntimeState* state)
874  : state_(state),
875  compare_less_than_(compare_less_than),
876  block_mgr_(state->block_mgr()),
877  unsorted_run_(NULL),
878  output_row_desc_(output_row_desc),
879  sort_tuple_slot_expr_ctxs_(slot_materialize_expr_ctxs),
880  mem_tracker_(mem_tracker),
881  profile_(profile) {
882 }
883 
884 Sorter::~Sorter() {
885  // Delete all blocks from the block mgr.
886  for (list<Run*>::iterator it = sorted_runs_.begin(); it != sorted_runs_.end(); ++it) {
887  (*it)->DeleteAllBlocks();
888  }
889  for (list<Run*>::iterator it = merging_runs_.begin(); it != merging_runs_.end(); ++it) {
890  (*it)->DeleteAllBlocks();
891  }
892  if (unsorted_run_ != NULL) unsorted_run_->DeleteAllBlocks();
893  block_mgr_->ClearReservations(block_mgr_client_);
894 }
895 
896 Status Sorter::Init() {
897  DCHECK(unsorted_run_ == NULL) << "Already initialized";
898  TupleDescriptor* sort_tuple_desc = output_row_desc_->tuple_descriptors()[0];
899  has_var_len_slots_ = sort_tuple_desc->string_slots().size() > 0;
900  in_mem_tuple_sorter_.reset(new TupleSorter(compare_less_than_,
901  block_mgr_->max_block_size(), sort_tuple_desc->byte_size(), state_));
902  unsorted_run_ = obj_pool_.Add(new Run(this, sort_tuple_desc, true));
903 
904  initial_runs_counter_ = ADD_COUNTER(profile_, "InitialRunsCreated", TUnit::UNIT);
905  num_merges_counter_ = ADD_COUNTER(profile_, "TotalMergesPerformed", TUnit::UNIT);
906  in_mem_sort_timer_ = ADD_TIMER(profile_, "InMemorySortTime");
907  sorted_data_size_ = ADD_COUNTER(profile_, "SortDataSize", TUnit::BYTES);
908 
909  int min_blocks_required = BLOCKS_REQUIRED_FOR_MERGE;
910  // Fixed and var-length blocks are separate, so we need BLOCKS_REQUIRED_FOR_MERGE
911  // blocks for both if there is var-length data.
912  if (output_row_desc_->tuple_descriptors()[0]->string_slots().size() > 0) {
913  min_blocks_required *= 2;
914  }
915  RETURN_IF_ERROR(block_mgr_->RegisterClient(min_blocks_required, mem_tracker_,
916  state_, &block_mgr_client_));
917 
918  DCHECK_NOTNULL(unsorted_run_);
919  RETURN_IF_ERROR(unsorted_run_->Init());
920  return Status::OK;
921 }
922 
923 Status Sorter::AddBatch(RowBatch* batch) {
924  DCHECK_NOTNULL(unsorted_run_);
925  DCHECK_NOTNULL(batch);
926  int num_processed = 0;
927  int cur_batch_index = 0;
928  while (cur_batch_index < batch->num_rows()) {
929  if (has_var_len_slots_) {
930  RETURN_IF_ERROR(unsorted_run_->AddBatch<true>(
931  batch, cur_batch_index, &num_processed));
932  } else {
933  RETURN_IF_ERROR(unsorted_run_->AddBatch<false>(
934  batch, cur_batch_index, &num_processed));
935  }
936  cur_batch_index += num_processed;
937  if (cur_batch_index < batch->num_rows()) {
938  // The current run is full. Sort it and begin the next one.
939  RETURN_IF_ERROR(SortRun());
940  RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllBlocks());
941  unsorted_run_ = obj_pool_.Add(
942  new Run(this, output_row_desc_->tuple_descriptors()[0], true));
943  unsorted_run_->Init();
944  }
945  }
946  return Status::OK;
947 }
948 
949 Status Sorter::InputDone() {
950  // Sort the tuples accumulated so far in the current run.
951  RETURN_IF_ERROR(SortRun());
952 
953  if (sorted_runs_.size() == 1) {
954  // The entire input fit in one run. Read sorted rows in GetNext() directly
955  // from the sorted run.
956  sorted_runs_.back()->PrepareRead();
957  } else {
958  // At least one merge is necessary.
959  int blocks_per_run = has_var_len_slots_ ? 2 : 1;
960  int min_buffers_for_merge = sorted_runs_.size() * blocks_per_run;
961  // Check if the final run needs to be unpinned.
962  bool unpinned_final = false;
963  if (block_mgr_->num_free_buffers() < min_buffers_for_merge - blocks_per_run) {
964  // Number of available buffers is less than the size of the final run and
965  // the buffers needed to read the remainder of the runs in memory.
966  // Unpin the final run.
967  RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllBlocks());
968  unpinned_final = true;
969  } else {
970  // No need to unpin the current run. There is enough memory to stream the
971  // other runs.
972  // TODO: revisit. It might be better to unpin some from this run if it means
973  // we can get double buffering in the other runs.
974  }
975 
976  // For an intermediate merge, intermediate_merge_batch contains deep-copied rows from
977  // the input runs. If (unmerged_sorted_runs_.size() > max_runs_per_final_merge),
978  // one or more intermediate merges are required.
979  // TODO: Attempt to allocate more memory before doing intermediate merges. This may
980  // be possible if other operators have relinquished memory after the sort has built
981  // its runs.
982  if (min_buffers_for_merge > block_mgr_->available_allocated_buffers()) {
983  DCHECK(unpinned_final);
984  RETURN_IF_ERROR(MergeIntermediateRuns());
985  }
986 
987  // Create the final merger.
988  CreateMerger(sorted_runs_.size());
989  }
990  return Status::OK;
991 }
992 
993 Status Sorter::GetNext(RowBatch* output_batch, bool* eos) {
994  if (sorted_runs_.size() == 1) {
995  DCHECK(sorted_runs_.back()->is_pinned_);
996  // In this case, only TupleRows are copied into output_batch. Sorted tuples are left
997  // in the pinned blocks in the single sorted run.
998  RETURN_IF_ERROR(sorted_runs_.back()->GetNext<false>(output_batch, eos));
999  } else {
1000  // In this case, rows are deep copied into output_batch.
1001  RETURN_IF_ERROR(merger_->GetNext(output_batch, eos));
1002  }
1003  return Status::OK;
1004 }
1005 
1006 Status Sorter::SortRun() {
1007  BufferedBlockMgr::Block* last_block = unsorted_run_->fixed_len_blocks_.back();
1008  if (last_block->valid_data_len() > 0) {
1009  sorted_data_size_->Add(last_block->valid_data_len());
1010  } else {
1011  RETURN_IF_ERROR(last_block->Delete());
1012  unsorted_run_->fixed_len_blocks_.pop_back();
1013  }
1014  if (has_var_len_slots_) {
1015  DCHECK_NOTNULL(unsorted_run_->var_len_copy_block_);
1016  last_block = unsorted_run_->var_len_blocks_.back();
1017  if (last_block->valid_data_len() > 0) {
1018  sorted_data_size_->Add(last_block->valid_data_len());
1019  } else {
1020  RETURN_IF_ERROR(last_block->Delete());
1021  unsorted_run_->var_len_blocks_.pop_back();
1022  if (unsorted_run_->var_len_blocks_.size() == 0) {
1023  RETURN_IF_ERROR(unsorted_run_->var_len_copy_block_->Delete());
1024  unsorted_run_->var_len_copy_block_ = NULL;
1025  }
1026  }
1027  }
1028  {
1029  SCOPED_TIMER(in_mem_sort_timer_);
1030  in_mem_tuple_sorter_->Sort(unsorted_run_);
1031  RETURN_IF_CANCELLED(state_);
1032  }
1033  sorted_runs_.push_back(unsorted_run_);
1034  unsorted_run_ = NULL;
1035  return Status::OK;
1036 }
1037 
1038 uint64_t Sorter::EstimateMergeMem(uint64_t available_blocks,
1039  RowDescriptor* row_desc, int merge_batch_size) {
1040  bool has_var_len_slots = row_desc->tuple_descriptors()[0]->string_slots().size() > 0;
1041  int blocks_per_run = has_var_len_slots ? 2 : 1;
1042  int max_input_runs_per_merge = (available_blocks / blocks_per_run) - 1;
1043  // During a merge, the batches corresponding to the input runs contain only TupleRows.
1044  // (The data itself is in pinned blocks held by the run)
1045  uint64_t input_batch_mem =
1046  merge_batch_size * sizeof(Tuple*) * max_input_runs_per_merge;
1047  // Since rows are deep copied into the output batch for the merger, use a pessimistic
1048  // estimate of the memory required.
1049  uint64_t output_batch_mem = RowBatch::AT_CAPACITY_MEM_USAGE;
1050 
1051  return input_batch_mem + output_batch_mem;
1052 }
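// Worked example with hypothetical inputs: for available_blocks = 16, var-len
// slots present (blocks_per_run = 2) and merge_batch_size = 1024,
// max_input_runs_per_merge = 16 / 2 - 1 = 7, so input_batch_mem =
// 1024 * sizeof(Tuple*) * 7 = 56KB on a 64-bit build, plus
// RowBatch::AT_CAPACITY_MEM_USAGE for the deep-copied output batch.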
1053 
1054 Status Sorter::MergeIntermediateRuns() {
1055  int blocks_per_run = has_var_len_slots_ ? 2 : 1;
1056  int max_runs_per_final_merge =
1057  block_mgr_->available_allocated_buffers() / blocks_per_run;
1058 
1059  // During an intermediate merge, blocks from the output sorted run will have to be
1060  // pinned.
1061  int max_runs_per_intermediate_merge = max_runs_per_final_merge - 1;
1062  DCHECK_GT(max_runs_per_intermediate_merge, 1);
1063  // For an intermediate merge, intermediate_merge_batch contains deep-copied rows from
1064  // the input runs. If (sorted_runs_.size() > max_runs_per_final_merge),
1065  // one or more intermediate merges are required.
1066  scoped_ptr<RowBatch> intermediate_merge_batch;
1067  while (sorted_runs_.size() > max_runs_per_final_merge) {
1068  // An intermediate merge adds one merge to unmerged_sorted_runs_.
1069  // Merging 'runs - (max_runs_final_ - 1)' number of runs is sufficient to guarantee
1070  // that the final merge can be performed.
1071  int num_runs_to_merge = min<int>(max_runs_per_intermediate_merge,
1072  sorted_runs_.size() - max_runs_per_intermediate_merge);
1073  CreateMerger(num_runs_to_merge);
1074  RowBatch intermediate_merge_batch(*output_row_desc_, state_->batch_size(),
1075  mem_tracker_);
1076  // merged_run is the new sorted run that is produced by the intermediate merge.
1077  Run* merged_run = obj_pool_.Add(
1078  new Run(this, output_row_desc_->tuple_descriptors()[0], false));
1079  RETURN_IF_ERROR(merged_run->Init());
1080  bool eos = false;
1081  while (!eos) {
1082  // Copy rows into the new run until done.
1083  int num_copied;
1084  RETURN_IF_CANCELLED(state_);
1085  RETURN_IF_ERROR(merger_->GetNext(&intermediate_merge_batch, &eos));
1086  Status ret_status;
1087  if (has_var_len_slots_) {
1088  ret_status = merged_run->AddBatch<true>(&intermediate_merge_batch,
1089  0, &num_copied);
1090  } else {
1091  ret_status = merged_run->AddBatch<false>(&intermediate_merge_batch,
1092  0, &num_copied);
1093  }
1094  if (!ret_status.ok()) return ret_status;
1095 
1096  DCHECK_EQ(num_copied, intermediate_merge_batch.num_rows());
1097  intermediate_merge_batch.Reset();
1098  }
1099 
1100  BufferedBlockMgr::Block* last_block = merged_run->fixed_len_blocks_.back();
1101  if (last_block->valid_data_len() > 0) {
1102  RETURN_IF_ERROR(last_block->Unpin());
1103  } else {
1104  RETURN_IF_ERROR(last_block->Delete());
1105  merged_run->fixed_len_blocks_.pop_back();
1106  }
1107  if (has_var_len_slots_) {
1108  last_block = merged_run->var_len_blocks_.back();
1109  if (last_block->valid_data_len() > 0) {
1110  RETURN_IF_ERROR(last_block->Unpin());
1111  } else {
1112  RETURN_IF_ERROR(last_block->Delete());
1113  merged_run->var_len_blocks_.pop_back();
1114  }
1115  }
1116  merged_run->is_pinned_ = false;
1117  sorted_runs_.push_back(merged_run);
1118  }
1119 
1120  return Status::OK;
1121 }
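// Worked example of the merge schedule, with hypothetical capacity: given 16
// available buffers and var-len data, max_runs_per_final_merge = 8 and
// max_runs_per_intermediate_merge = 7. Starting from 20 sorted runs, the first
// intermediate merge combines min(7, 20 - 7) = 7 runs into one, leaving 14;
// the second combines another 7, leaving 8, which the final merge can handle.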
1122 
1123 Status Sorter::CreateMerger(int num_runs) {
1124  DCHECK_GT(num_runs, 1);
1125 
1126  // Clean up the runs from the previous merge.
1127  for (list<Run*>::iterator it = merging_runs_.begin(); it != merging_runs_.end(); ++it) {
1128  (*it)->DeleteAllBlocks();
1129  }
1130  merging_runs_.clear();
1131  merger_.reset(
1132  new SortedRunMerger(compare_less_than_, output_row_desc_, profile_, true));
1133 
1134  vector<function<Status (RowBatch**)> > merge_runs;
1135  merge_runs.reserve(num_runs);
1136  for (int i = 0; i < num_runs; ++i) {
1137  Run* run = sorted_runs_.front();
1138  run->PrepareRead();
1139  // Run::GetNextBatch() is used by the merger to retrieve a batch of rows to merge
1140  // from this run.
1141  merge_runs.push_back(bind<Status>(mem_fn(&Run::GetNextBatch), run, _1));
1142  sorted_runs_.pop_front();
1143  merging_runs_.push_back(run);
1144  }
1145  RETURN_IF_ERROR(merger_->Prepare(merge_runs));
1146 
1147  num_merges_counter_->Add(1);
1148  return Status::OK;
1149 }
1150 
1151 } // namespace impala