coordinator.cc
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/coordinator.h"

#include <limits>
#include <map>
#include <thrift/protocol/TDebugProtocol.h>
#include <boost/algorithm/string/join.hpp>
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <boost/accumulators/statistics/min.hpp>
#include <boost/accumulators/statistics/mean.hpp>
#include <boost/accumulators/statistics/median.hpp>
#include <boost/accumulators/statistics/max.hpp>
#include <boost/accumulators/statistics/variance.hpp>
#include <boost/bind.hpp>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/unordered_set.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string.hpp>
#include <gutil/strings/substitute.h>

#include "common/logging.h"
#include "exprs/expr.h"
#include "exec/data-sink.h"
#include "runtime/client-cache.h"
#include "runtime/data-stream-mgr.h"
#include "runtime/mem-tracker.h"
#include "runtime/exec-env.h"
#include "runtime/hdfs-fs-cache.h"
#include "runtime/plan-fragment-executor.h"
#include "runtime/row-batch.h"
#include "runtime/parallel-executor.h"
#include "statestore/scheduler.h"
#include "exec/scan-node.h"
#include "util/container-util.h"
#include "util/debug-util.h"
#include "util/error-util.h"
#include "util/hdfs-bulk-ops.h"
#include "util/hdfs-util.h"
#include "util/llama-util.h"
#include "util/network-util.h"
#include "util/pretty-printer.h"
#include "util/summary-util.h"
#include "gen-cpp/ImpalaInternalService.h"
#include "gen-cpp/ImpalaInternalService_types.h"
#include "gen-cpp/Frontend_types.h"
#include "gen-cpp/PlanNodes_types.h"
#include "gen-cpp/Partitions_types.h"
#include "gen-cpp/ImpalaInternalService_constants.h"

#include "common/names.h"

using namespace apache::thrift;
using namespace strings;
namespace accumulators = boost::accumulators;
using boost::algorithm::iequals;
using boost::algorithm::is_any_of;
using boost::algorithm::join;
using boost::algorithm::token_compress_on;
using boost::algorithm::split;

DECLARE_int32(be_port);
DECLARE_string(hostname);

DEFINE_bool(insert_inherit_permissions, false, "If true, new directories created by "
    "INSERTs will inherit the permissions of their parent directories");

namespace impala {

// Container for debug options in TPlanFragmentExecParams (debug_node, debug_action,
// debug_phase).
struct DebugOptions {
  int backend_num;
  int node_id;
  TDebugAction::type action;
  TExecNodePhase::type phase;  // INVALID: debug options invalid

  DebugOptions()
    : backend_num(-1), node_id(-1), action(TDebugAction::WAIT),
      phase(TExecNodePhase::INVALID) {}
};
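
// Illustrative example (values hypothetical): phase == TExecNodePhase::INVALID is
// the "not set" sentinel used throughout this file. A populated instance such as
//   DebugOptions opts;
//   opts.backend_num = 1;                  // apply only on backend #1 (-1 = all)
//   opts.node_id = 0;                      // target plan node 0
//   opts.phase = TExecNodePhase::GETNEXT;  // trigger during GetNext()
//   opts.action = TDebugAction::WAIT;      // block at that point
// is what ProcessQueryOptions() below produces from the debug_action query option.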

// Execution state of a particular fragment.
// Concurrent accesses:
// - GetNodeThroughput() called when coordinator's profile is printed
// - updates through UpdateFragmentExecStatus()
class Coordinator::BackendExecState {
 public:
  TUniqueId fragment_instance_id;
  MonotonicStopWatch stopwatch;  // wall clock timer for this fragment
  const TNetworkAddress backend_address;  // of ImpalaInternalService
  int64_t total_split_size;  // summed up across all splits; in bytes

  // assembled in c'tor
  TExecPlanFragmentParams rpc_params;

  // Fragment idx for this ExecState
  int fragment_idx;

  // The 0-based instance idx.
  int instance_idx;

  // protects fields below
  // lock ordering: Coordinator::lock_ can only get obtained *prior*
  // to lock
  boost::mutex lock;

  // if the status indicates an error status, execution of this fragment
  // has either been aborted by the remote backend (which then reported the error)
  // or cancellation has been initiated; either way, execution must not be cancelled
  Status status;

  bool initiated;  // if true, TPlanExecRequest rpc has been sent
  bool done;  // if true, execution terminated; do not cancel in that case
  bool profile_created;  // true after the first call to profile->Update()
  RuntimeProfile* profile;  // owned by obj_pool()
  ErrorLogMap error_log;  // errors reported by this backend

  // Total scan ranges complete across all scan nodes
  int64_t total_ranges_complete;

  FragmentInstanceCounters aggregate_counters;

  BackendExecState(QuerySchedule& schedule, Coordinator* coord,
      const TNetworkAddress& coord_address,
      int backend_num, const TPlanFragment& fragment, int fragment_idx,
      const FragmentExecParams& params, int instance_idx,
      DebugOptions* debug_options, ObjectPool* obj_pool)
    : fragment_instance_id(params.instance_ids[instance_idx]),
      backend_address(params.hosts[instance_idx]),
      total_split_size(0),
      fragment_idx(fragment_idx),
      instance_idx(instance_idx),
      initiated(false),
      done(false),
      profile_created(false),
      total_ranges_complete(0) {
    stringstream ss;
    ss << "Instance " << PrintId(fragment_instance_id)
       << " (host=" << backend_address << ")";
    profile = obj_pool->Add(new RuntimeProfile(obj_pool, ss.str()));
    coord->SetExecPlanFragmentParams(schedule, backend_num, fragment, fragment_idx,
        params, instance_idx, coord_address, &rpc_params);
    if (debug_options != NULL) {
      rpc_params.params.__set_debug_node_id(debug_options->node_id);
      rpc_params.params.__set_debug_action(debug_options->action);
      rpc_params.params.__set_debug_phase(debug_options->phase);
    }
    ComputeTotalSplitSize();
  }

  // Computes sum of split sizes of leftmost scan. Call only after setting
  // exec_params.
  void ComputeTotalSplitSize();

  // Return value of throughput counter for given plan_node_id, or 0 if that node
  // doesn't exist.
  // Thread-safe.
  int64_t GetNodeThroughput(int plan_node_id);

  // Return number of completed scan ranges for plan_node_id, or 0 if that node
  // doesn't exist.
  // Thread-safe.
  int64_t GetNumScanRangesCompleted(int plan_node_id);

  // Updates the total number of scan ranges complete for this fragment. Returns
  // the delta since the last time this was called.
  // lock must be taken before calling this.
  int64_t UpdateNumScanRangesCompleted();
};
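
// Lock-ordering sketch (illustrative, not code from this file): when both the
// query-wide lock and a per-fragment lock are needed, the coordinator lock must be
// acquired first, per the comment on BackendExecState::lock above:
//   lock_guard<mutex> l(lock_);              // 1. Coordinator::lock_
//   lock_guard<mutex> l2(exec_state->lock);  // 2. BackendExecState::lock
// Acquiring them in the opposite order on another thread risks deadlock.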

void Coordinator::BackendExecState::ComputeTotalSplitSize() {
  const PerNodeScanRanges& per_node_scan_ranges = rpc_params.params.per_node_scan_ranges;
  total_split_size = 0;
  BOOST_FOREACH(const PerNodeScanRanges::value_type& entry, per_node_scan_ranges) {
    BOOST_FOREACH(const TScanRangeParams& scan_range_params, entry.second) {
      if (!scan_range_params.scan_range.__isset.hdfs_file_split) continue;
      total_split_size += scan_range_params.scan_range.hdfs_file_split.length;
    }
  }
}

int64_t Coordinator::BackendExecState::GetNodeThroughput(int plan_node_id) {
  RuntimeProfile::Counter* counter = NULL;
  {
    lock_guard<mutex> l(lock);
    CounterMap& throughput_counters = aggregate_counters.throughput_counters;
    CounterMap::iterator i = throughput_counters.find(plan_node_id);
    if (i == throughput_counters.end()) return 0;
    counter = i->second;
  }
  DCHECK(counter != NULL);
  // make sure not to hold lock when calling value() to avoid potential deadlocks
  return counter->value();
}

int64_t Coordinator::BackendExecState::GetNumScanRangesCompleted(int plan_node_id) {
  RuntimeProfile::Counter* counter = NULL;
  {
    lock_guard<mutex> l(lock);
    CounterMap& ranges_complete = aggregate_counters.scan_ranges_complete_counters;
    CounterMap::iterator i = ranges_complete.find(plan_node_id);
    if (i == ranges_complete.end()) return 0;
    counter = i->second;
  }
  DCHECK(counter != NULL);
  // make sure not to hold lock when calling value() to avoid potential deadlocks
  return counter->value();
}

int64_t Coordinator::BackendExecState::UpdateNumScanRangesCompleted() {
  int64_t total = 0;
  CounterMap& complete = aggregate_counters.scan_ranges_complete_counters;
  for (CounterMap::iterator i = complete.begin(); i != complete.end(); ++i) {
    total += i->second->value();
  }
  int64_t delta = total - total_ranges_complete;
  total_ranges_complete = total;
  DCHECK_GE(delta, 0);
  return delta;
}
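
// For example, if the completion counters currently sum to 10 and the previous call
// left total_ranges_complete at 7, this returns a delta of 3 and advances the total
// to 10; UpdateFragmentExecStatus() feeds that delta into progress_.Update().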

Coordinator::Coordinator(ExecEnv* exec_env, RuntimeProfile::EventSequence* events)
  : exec_env_(exec_env),
    has_called_wait_(false),
    returned_all_results_(false),
    executor_(NULL),  // Set in Prepare()
    query_mem_tracker_(),  // Set in Exec()
    num_remaining_backends_(0),
    obj_pool_(new ObjectPool()),
    query_events_(events) {
}

Coordinator::~Coordinator() {
  query_mem_tracker_.reset();
}

TExecNodePhase::type GetExecNodePhase(const string& key) {
  map<int, const char*>::const_iterator entry =
      _TExecNodePhase_VALUES_TO_NAMES.begin();
  for (; entry != _TExecNodePhase_VALUES_TO_NAMES.end(); ++entry) {
    if (iequals(key, (*entry).second)) {
      return static_cast<TExecNodePhase::type>(entry->first);
    }
  }
  return TExecNodePhase::INVALID;
}
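
// The lookup above is case-insensitive (iequals), so e.g. GetExecNodePhase("getnext")
// and GetExecNodePhase("GETNEXT") both map to TExecNodePhase::GETNEXT; unrecognized
// keys fall through to TExecNodePhase::INVALID.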

// TODO: templatize this
TDebugAction::type GetDebugAction(const string& key) {
  map<int, const char*>::const_iterator entry =
      _TDebugAction_VALUES_TO_NAMES.begin();
  for (; entry != _TDebugAction_VALUES_TO_NAMES.end(); ++entry) {
    if (iequals(key, (*entry).second)) {
      return static_cast<TDebugAction::type>(entry->first);
    }
  }
  return TDebugAction::WAIT;
}

static void ProcessQueryOptions(
    const TQueryOptions& query_options, DebugOptions* debug_options) {
  DCHECK(debug_options != NULL);
  if (!query_options.__isset.debug_action || query_options.debug_action.empty()) {
    debug_options->phase = TExecNodePhase::INVALID;  // signal not set
    return;
  }
  vector<string> components;
  split(components, query_options.debug_action, is_any_of(":"), token_compress_on);
  if (components.size() < 3 || components.size() > 4) return;
  if (components.size() == 3) {
    debug_options->backend_num = -1;
    debug_options->node_id = atoi(components[0].c_str());
    debug_options->phase = GetExecNodePhase(components[1]);
    debug_options->action = GetDebugAction(components[2]);
  } else {
    debug_options->backend_num = atoi(components[0].c_str());
    debug_options->node_id = atoi(components[1].c_str());
    debug_options->phase = GetExecNodePhase(components[2]);
    debug_options->action = GetDebugAction(components[3]);
  }
  DCHECK(!(debug_options->phase == TExecNodePhase::CLOSE &&
      debug_options->action == TDebugAction::WAIT))
      << "Do not use CLOSE:WAIT debug actions "
      << "because nodes cannot be cancelled in Close()";
}
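
// Worked example (hypothetical option values): debug_action="1:0:GETNEXT:WAIT"
// parses to backend_num=1, node_id=0, phase=GETNEXT, action=WAIT, i.e. "make plan
// node 0 block in GetNext(), but only on backend #1". The three-component form
// debug_action="0:PREPARE:FAIL" omits the backend number, so backend_num=-1 and the
// action applies to node 0 on every backend.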

Status Coordinator::Exec(QuerySchedule& schedule,
    vector<ExprContext*>* output_expr_ctxs) {
  const TQueryExecRequest& request = schedule.request();
  DCHECK_GT(request.fragments.size(), 0);
  needs_finalization_ = request.__isset.finalize_params;
  if (needs_finalization_) {
    finalize_params_ = request.finalize_params;
  }

  VLOG_QUERY << "Exec() query_id=" << schedule.query_id();
  stmt_type_ = request.stmt_type;
  query_id_ = schedule.query_id();
  desc_tbl_ = request.desc_tbl;
  query_ctx_ = request.query_ctx;

  query_profile_.reset(
      new RuntimeProfile(obj_pool(), "Execution Profile " + PrintId(query_id_)));
  finalization_timer_ = ADD_TIMER(query_profile_, "FinalizationTimer");

  SCOPED_TIMER(query_profile_->total_time_counter());

  vector<FragmentExecParams>* fragment_exec_params = schedule.exec_params();
  TNetworkAddress coord = MakeNetworkAddress(FLAGS_hostname, FLAGS_be_port);

  // to keep things simple, make async Cancel() calls wait until plan fragment
  // execution has been initiated, otherwise we might try to cancel fragment
  // execution at backends where it hasn't even started
  lock_guard<mutex> l(lock_);

  // we run the root fragment ourselves if it is unpartitioned
  bool has_coordinator_fragment =
      request.fragments[0].partition.type == TPartitionType::UNPARTITIONED;

  if (has_coordinator_fragment) {
    executor_.reset(new PlanFragmentExecutor(
        exec_env_, PlanFragmentExecutor::ReportStatusCallback()));
    // If a coordinator fragment is requested (for most queries this
    // will be the case; the exception is parallel INSERT queries), start
    // this before starting any more plan fragments in backend threads,
    // otherwise they start sending data before the local exchange node
    // had a chance to register with the stream mgr.
    TExecPlanFragmentParams rpc_params;
    SetExecPlanFragmentParams(schedule, 0, request.fragments[0], 0,
        (*fragment_exec_params)[0], 0, coord, &rpc_params);
    RETURN_IF_ERROR(executor_->Prepare(rpc_params));

    // Prepare output_expr_ctxs before optimizing the LLVM module. The other exprs of
    // this coordinator fragment have been prepared in executor_->Prepare().
    DCHECK(output_expr_ctxs != NULL);
    RETURN_IF_ERROR(Expr::CreateExprTrees(
        runtime_state()->obj_pool(), request.fragments[0].output_exprs,
        output_expr_ctxs));
    MemTracker* output_expr_tracker = runtime_state()->obj_pool()->Add(new MemTracker(
        -1, -1, "Output exprs", runtime_state()->instance_mem_tracker(), false));
    RETURN_IF_ERROR(Expr::Prepare(
        *output_expr_ctxs, runtime_state(), row_desc(), output_expr_tracker));
  } else {
    // The coordinator instance may require a query mem tracker even if there is no
    // coordinator fragment. For example, result-caching tracks memory via the query mem
    // tracker.
    // If there is a fragment, the fragment executor created above initializes the query
    // mem tracker. If not, the query mem tracker is created here.
    int64_t query_limit = -1;
    if (query_ctx_.request.query_options.__isset.mem_limit &&
        query_ctx_.request.query_options.mem_limit > 0) {
      query_limit = query_ctx_.request.query_options.mem_limit;
    }
    MemTracker* pool_tracker = MemTracker::GetRequestPoolMemTracker(
        schedule.request_pool(), exec_env_->process_mem_tracker());
    query_mem_tracker_ =
        MemTracker::GetQueryMemTracker(query_id_, query_limit, -1, pool_tracker, NULL);

    executor_.reset(NULL);
  }

  // Initialize the execution profile structures.
  InitExecProfile(request);

  DebugOptions debug_options;
  ProcessQueryOptions(schedule.query_options(), &debug_options);

  // start fragment instances from left to right, so that receivers have
  // Prepare()'d before senders start sending
  backend_exec_states_.resize(schedule.num_backends());
  num_remaining_backends_ = schedule.num_backends();
  VLOG_QUERY << "starting " << schedule.num_backends()
             << " backends for query " << query_id_;

  query_events_->MarkEvent("Ready to start remote fragments");
  int backend_num = 0;
  StatsMetric<double> latencies("fragment-latencies", TUnit::TIME_NS);
  for (int fragment_idx = (has_coordinator_fragment ? 1 : 0);
       fragment_idx < request.fragments.size(); ++fragment_idx) {
    const FragmentExecParams& params = (*fragment_exec_params)[fragment_idx];

    // set up exec states
    int num_hosts = params.hosts.size();
    DCHECK_GT(num_hosts, 0);
    for (int instance_idx = 0; instance_idx < num_hosts; ++instance_idx) {
      DebugOptions* backend_debug_options =
          (debug_options.phase != TExecNodePhase::INVALID
            && (debug_options.backend_num == -1
                || debug_options.backend_num == backend_num)
            ? &debug_options
            : NULL);
      // TODO: pool of pre-formatted BackendExecStates?
      BackendExecState* exec_state =
          obj_pool()->Add(new BackendExecState(schedule, this, coord, backend_num,
              request.fragments[fragment_idx], fragment_idx,
              params, instance_idx, backend_debug_options, obj_pool()));
      backend_exec_states_[backend_num] = exec_state;
      ++backend_num;
      VLOG(2) << "Exec(): starting instance: fragment_idx=" << fragment_idx
              << " instance_id=" << params.instance_ids[instance_idx];
    }
    fragment_profiles_[fragment_idx].num_instances = num_hosts;

    // Issue all rpcs in parallel
    Status fragments_exec_status = ParallelExecutor::Exec(
        bind<Status>(mem_fn(&Coordinator::ExecRemoteFragment), this, _1),
        reinterpret_cast<void**>(&backend_exec_states_[backend_num - num_hosts]),
        num_hosts, &latencies);

    if (!fragments_exec_status.ok()) {
      DCHECK(query_status_.ok());  // nobody should have been able to cancel
      query_status_ = fragments_exec_status;
      // tear down running fragments and return
      CancelInternal();
      return fragments_exec_status;
    }
  }

  query_events_->MarkEvent("Remote fragments started");
  query_profile_->AddInfoString("Fragment start latencies",
      latencies.ToHumanReadable());

  // If we have a coordinator fragment and remote fragments (the common case),
  // release the thread token on the coordinator fragment. This fragment
  // spends most of the time waiting and doing very little work. Holding on to
  // the token causes underutilization of the machine. If there are 12 queries
  // on this node, that's 12 tokens reserved for no reason.
  if (has_coordinator_fragment && request.fragments.size() > 1) {
    executor_->ReleaseThreadToken();
  }

  PrintBackendInfo();
  CreateAggregateCounters(request.fragments);

  stringstream ss;
  ss << "Query " << query_id_;
  progress_ = ProgressUpdater(ss.str(), schedule.num_scan_ranges());

  return Status::OK;
}

Status Coordinator::GetStatus() {
  lock_guard<mutex> l(lock_);
  return query_status_;
}

Status Coordinator::UpdateStatus(const Status& status, const TUniqueId* instance_id) {
  {
    lock_guard<mutex> l(lock_);

    // The query is done and we are just waiting for remote fragments to clean up.
    // Ignore their cancelled updates.
    if (returned_all_results_ && status.IsCancelled()) return query_status_;

    // nothing to update
    if (status.ok()) return query_status_;

    // don't override an error status; also, cancellation has already started
    if (!query_status_.ok()) return query_status_;

    query_status_ = status;
    CancelInternal();
  }

  // Log the id of the fragment that first failed so we can track it down easier.
  if (instance_id != NULL) {
    VLOG_QUERY << "Query id=" << query_id_ << " failed because fragment id="
               << *instance_id << " failed.";
  }

  return query_status_;
}
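
// Precedence example (hypothetical failure): if backend A reports an HDFS read error
// and the ensuing cancellation later causes backend B to report CANCELLED, the first
// call stores the read error in query_status_ and runs CancelInternal(); the second
// call returns early, so the root cause is what GetStatus() keeps reporting.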

void Coordinator::PopulatePathPermissionCache(hdfsFS fs, const string& path_str,
    PermissionCache* permissions_cache) {
  // Find out if the path begins with a hdfs:// -style prefix, and remove it and the
  // location (e.g. host:port) if so.
  size_t scheme_end = path_str.find("://");
  string stripped_str;
  if (scheme_end != string::npos) {
    // Skip past the subsequent location:port/ prefix.
    stripped_str = path_str.substr(path_str.find("/", scheme_end + 3));
  } else {
    stripped_str = path_str;
  }

  // Get the list of path components, used to build all path prefixes.
  vector<string> components;
  split(components, stripped_str, is_any_of("/"));

  // Build a set of all prefixes (including the complete string) of stripped_path. So
  // /a/b/c/d leads to a vector of: /a, /a/b, /a/b/c, /a/b/c/d
  vector<string> prefixes;
  // Stores the current prefix
  stringstream accumulator;
  BOOST_FOREACH(const string& component, components) {
    if (component.empty()) continue;
    accumulator << "/" << component;
    prefixes.push_back(accumulator.str());
  }

  // Now for each prefix, stat() it to see if a) it exists and b) if so what its
  // permissions are. When we encounter a directory that doesn't exist, we record the
  // fact that we need to create it, and the permissions of its parent dir for it to
  // inherit.
  //
  // Every prefix is recorded in the PermissionCache so we don't do more than one stat()
  // for each path. If we need to create the directory, we record it as the pair (true,
  // perms) so that the caller can identify which directories need their permissions
  // explicitly set.

  // Set to the permission of the immediate parent (i.e. the permissions to inherit if
  // the current dir doesn't exist).
  short permissions = 0;
  BOOST_FOREACH(const string& path, prefixes) {
    PermissionCache::const_iterator it = permissions_cache->find(path);
    if (it == permissions_cache->end()) {
      hdfsFileInfo* info = hdfsGetPathInfo(fs, path.c_str());
      if (info != NULL) {
        // File exists, so fill the cache with its current permissions.
        permissions_cache->insert(
            make_pair(path, make_pair(false, info->mPermissions)));
        permissions = info->mPermissions;
        hdfsFreeFileInfo(info, 1);
      } else {
        // File doesn't exist, so we need to set its permissions to its immediate
        // parent's once it's been created.
        permissions_cache->insert(make_pair(path, make_pair(true, permissions)));
      }
    } else {
      permissions = it->second.second;
    }
  }
}
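
// Worked example (hypothetical paths): for path_str "hdfs://nn:8020/warehouse/t/p=1"
// where /warehouse exists with mode 0775 but /warehouse/t and /warehouse/t/p=1 do
// not, the cache ends up as:
//   "/warehouse"       -> (false, 0775)  // exists; nothing to create
//   "/warehouse/t"     -> (true, 0775)   // create, inheriting from /warehouse
//   "/warehouse/t/p=1" -> (true, 0775)   // create, inheriting transitively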

Status Coordinator::FinalizeSuccessfulInsert() {
  PermissionCache permissions_cache;
  hdfsFS hdfs_connection;
  // InsertStmt ensures that all partitions are on the same filesystem as the table's
  // base directory, so opening a single connection is okay.
  // TODO: modify this code so that restriction can be lifted.
  RETURN_IF_ERROR(HdfsFsCache::instance()->GetConnection(
      finalize_params_.hdfs_base_dir, &hdfs_connection));

  // INSERT finalization happens in the following five steps
  // 1. If OVERWRITE, remove all the files in the target directory
  // 2. Create all the necessary partition directories.
  HdfsOperationSet partition_create_ops(&hdfs_connection);
  DescriptorTbl* descriptor_table;
  DescriptorTbl::Create(obj_pool(), desc_tbl_, &descriptor_table);
  HdfsTableDescriptor* hdfs_table = static_cast<HdfsTableDescriptor*>(
      descriptor_table->GetTableDescriptor(finalize_params_.table_id));
  DCHECK(hdfs_table != NULL) << "INSERT target table not known in descriptor table: "
                             << finalize_params_.table_id;

  // Loop over all partitions that were updated by this insert, and create the set of
  // filesystem operations required to create the correct partition structure on disk.
  BOOST_FOREACH(const PartitionStatusMap::value_type& partition, per_partition_status_) {
    SCOPED_TIMER(ADD_CHILD_TIMER(query_profile_, "Overwrite/PartitionCreationTimer",
        "FinalizationTimer"));

    // Look up the partition in the descriptor table.
    stringstream part_path_ss;
    if (partition.second.id == -1) {
      // If this is a non-existent partition, use the default partition location of
      // <base_dir>/part_key_1=val/part_key_2=val/...
      part_path_ss << finalize_params_.hdfs_base_dir << "/" << partition.first;
    } else {
      HdfsPartitionDescriptor* part = hdfs_table->GetPartition(partition.second.id);
      DCHECK(part != NULL) << "Partition " << partition.second.id
                           << " not known in descriptor table";
      part_path_ss << part->location();
    }
    const string& part_path = part_path_ss.str();

    // If this is an overwrite insert, we will need to delete any updated partitions
    if (finalize_params_.is_overwrite) {
      if (partition.first.empty()) {
        // If the root directory is written to, then the table must not be partitioned
        DCHECK(per_partition_status_.size() == 1);
        // We need to be a little more careful, and only delete data files in the root
        // because the tmp directories the sink(s) wrote are there also.
        // So only delete files in the table directory - all files are treated as data
        // files by Hive and Impala, but directories are ignored (and may legitimately
        // be used to store permanent non-table data by other applications).
        int num_files = 0;
        hdfsFileInfo* existing_files =
            hdfsListDirectory(hdfs_connection, part_path.c_str(), &num_files);
        if (existing_files == NULL) {
          return GetHdfsErrorMsg("Could not list directory: ", part_path);
        }
        for (int i = 0; i < num_files; ++i) {
          const string filename = path(existing_files[i].mName).filename().string();
          if (existing_files[i].mKind == kObjectKindFile && !IsHiddenFile(filename)) {
            partition_create_ops.Add(DELETE, existing_files[i].mName);
          }
        }
        hdfsFreeFileInfo(existing_files, num_files);
      } else {
        // This is a partition directory, not the root directory; we can delete it
        // recursively with abandon, after first checking that it exists.
        // TODO: There's a potential race here between checking for the directory
        // and a third party deleting it.
        if (FLAGS_insert_inherit_permissions) {
          PopulatePathPermissionCache(hdfs_connection, part_path, &permissions_cache);
        }
        if (hdfsExists(hdfs_connection, part_path.c_str()) != -1) {
          partition_create_ops.Add(DELETE_THEN_CREATE, part_path);
        } else {
          // Otherwise just create the directory.
          partition_create_ops.Add(CREATE_DIR, part_path);
        }
      }
    } else {
      if (FLAGS_insert_inherit_permissions) {
        PopulatePathPermissionCache(hdfs_connection, part_path, &permissions_cache);
      }
      if (hdfsExists(hdfs_connection, part_path.c_str()) == -1) {
        partition_create_ops.Add(CREATE_DIR, part_path);
      }
    }
  }

  {
    SCOPED_TIMER(ADD_CHILD_TIMER(query_profile_, "Overwrite/PartitionCreationTimer",
        "FinalizationTimer"));
    if (!partition_create_ops.Execute(exec_env_->hdfs_op_thread_pool(), false)) {
      BOOST_FOREACH(const HdfsOperationSet::Error& err, partition_create_ops.errors()) {
        // It's ok to ignore errors creating the directories, since they may already
        // exist. If there are permission errors, we'll run into them later.
        if (err.first->op() != CREATE_DIR) {
          stringstream ss;
          ss << "Error(s) deleting partition directories. First error (of "
             << partition_create_ops.errors().size() << ") was: " << err.second;
          return Status(ss.str());
        }
      }
    }
  }

  // 3. Move all tmp files
  HdfsOperationSet move_ops(&hdfs_connection);
  HdfsOperationSet dir_deletion_ops(&hdfs_connection);

  BOOST_FOREACH(FileMoveMap::value_type& move, files_to_move_) {
    // An empty destination means delete, so this is a directory. These get deleted in
    // a separate pass to ensure that we have moved all the contents of the directory
    // first.
    if (move.second.empty()) {
      VLOG_ROW << "Deleting file: " << move.first;
      dir_deletion_ops.Add(DELETE, move.first);
    } else {
      VLOG_ROW << "Moving tmp file: " << move.first << " to " << move.second;
      move_ops.Add(RENAME, move.first, move.second);
    }
  }

  {
    SCOPED_TIMER(ADD_CHILD_TIMER(query_profile_, "FileMoveTimer", "FinalizationTimer"));
    if (!move_ops.Execute(exec_env_->hdfs_op_thread_pool(), false)) {
      stringstream ss;
      ss << "Error(s) moving partition files. First error (of "
         << move_ops.errors().size() << ") was: " << move_ops.errors()[0].second;
      return Status(ss.str());
    }
  }

  // 4. Delete temp directories
  {
    SCOPED_TIMER(ADD_CHILD_TIMER(query_profile_, "FileDeletionTimer",
        "FinalizationTimer"));
    if (!dir_deletion_ops.Execute(exec_env_->hdfs_op_thread_pool(), false)) {
      stringstream ss;
      ss << "Error(s) deleting staging directories. First error (of "
         << dir_deletion_ops.errors().size() << ") was: "
         << dir_deletion_ops.errors()[0].second;
      return Status(ss.str());
    }
  }

  // 5. Optionally update the permissions of the created partition directories
  // Do this last in case we make the dirs unwritable.
  if (FLAGS_insert_inherit_permissions) {
    HdfsOperationSet chmod_ops(&hdfs_connection);
    BOOST_FOREACH(const PermissionCache::value_type& perm, permissions_cache) {
      bool new_dir = perm.second.first;
      if (new_dir) {
        short permissions = perm.second.second;
        VLOG_QUERY << "INSERT created new directory: " << perm.first
                   << ", inherited permissions are: " << oct << permissions;
        chmod_ops.Add(CHMOD, perm.first, permissions);
      }
    }
    if (!chmod_ops.Execute(exec_env_->hdfs_op_thread_pool(), false)) {
      stringstream ss;
      ss << "Error(s) setting permissions on newly created partition directories. First"
         << " error (of " << chmod_ops.errors().size() << ") was: "
         << chmod_ops.errors()[0].second;
      return Status(ss.str());
    }
  }

  return Status::OK;
}

Status Coordinator::FinalizeQuery() {
  // All backends must have reported their final statuses before finalization, which is
  // a post-condition of Wait. If the query was not successful, still try to clean up
  // the staging directory.
  DCHECK(has_called_wait_);
  DCHECK(needs_finalization_);

  VLOG_QUERY << "Finalizing query: " << query_id_;
  SCOPED_TIMER(finalization_timer_);
  Status return_status = GetStatus();
  if (return_status.ok()) {
    return_status = FinalizeSuccessfulInsert();
  }

  stringstream staging_dir;
  DCHECK(finalize_params_.__isset.staging_dir);
  staging_dir << finalize_params_.staging_dir << "/" << PrintId(query_id_, "_") << "/";

  hdfsFS hdfs_conn;
  RETURN_IF_ERROR(HdfsFsCache::instance()->GetConnection(staging_dir.str(), &hdfs_conn));
  VLOG_QUERY << "Removing staging directory: " << staging_dir.str();
  hdfsDelete(hdfs_conn, staging_dir.str().c_str(), 1);

  return return_status;
}
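
// The staging directory removed above is per-query: PrintId(query_id_, "_") renders
// the query id with '_' as the separator, so the path has the (illustrative) shape
// <staging_dir>/<query_id_hi>_<query_id_lo>/ and one query's cleanup cannot touch
// another query's staging data.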

Status Coordinator::WaitForAllBackends() {
  unique_lock<mutex> l(lock_);
  while (num_remaining_backends_ > 0 && query_status_.ok()) {
    VLOG_QUERY << "Coordinator waiting for backends to finish, "
               << num_remaining_backends_ << " remaining";
    backend_completion_cv_.wait(l);
  }
  if (query_status_.ok()) {
    VLOG_QUERY << "All backends finished successfully.";
  } else {
    VLOG_QUERY << "All backends finished due to one or more errors.";
  }

  return query_status_;
}

Status Coordinator::Wait() {
  lock_guard<mutex> l(wait_lock_);
  SCOPED_TIMER(query_profile_->total_time_counter());
  if (has_called_wait_) return Status::OK;
  has_called_wait_ = true;
  Status return_status = Status::OK;
  if (executor_.get() != NULL) {
    // Open() may block
    return_status = UpdateStatus(executor_->Open(), NULL);

    if (return_status.ok()) {
      // If the coordinator fragment has a sink, it will have finished executing at this
      // point. It's safe therefore to copy the set of files to move and updated
      // partitions into the query-wide set.
      RuntimeState* state = runtime_state();
      DCHECK(state != NULL);

      // No other backends should have updated these structures if the coordinator has a
      // fragment. (Backends have a sink only if the coordinator does not)
      DCHECK_EQ(files_to_move_.size(), 0);
      DCHECK_EQ(per_partition_status_.size(), 0);

      // Because there are no other updates, it is safe to copy the maps rather than
      // merge them.
      files_to_move_ = *state->hdfs_files_to_move();
      per_partition_status_ = *state->per_partition_status();
    }
  } else {
    // Query finalization can only happen when all backends have reported
    // relevant state. They only have relevant state to report in the parallel
    // INSERT case, otherwise all the relevant state is from the coordinator
    // fragment which will be available after Open() returns.
    // Ignore the returned status if finalization is required, since FinalizeQuery()
    // will pick it up and needs to execute regardless.
    Status status = WaitForAllBackends();
    if (!needs_finalization_ && !status.ok()) return status;
  }

  // Query finalization is required only for HDFS table sinks
  if (needs_finalization_) {
    RETURN_IF_ERROR(FinalizeQuery());
  }

  if (stmt_type_ == TStmtType::DML) {
    query_profile_->AddInfoString("Insert Stats",
        DataSink::OutputInsertStats(per_partition_status_, "\n"));
    // For DML queries, when Wait is done, the query is complete. Report aggregate
    // query profiles at this point.
    // TODO: make sure ReportQuerySummary gets called on error
    ReportQuerySummary();
  }

  return return_status;
}

Status Coordinator::GetNext(RowBatch** batch, RuntimeState* state) {
  VLOG_ROW << "GetNext() query_id=" << query_id_;
  DCHECK(has_called_wait_);
  SCOPED_TIMER(query_profile_->total_time_counter());

  if (executor_.get() == NULL) {
    // If there is no local fragment, we produce no output, and execution will
    // have finished after Wait.
    *batch = NULL;
    return GetStatus();
  }

  // do not acquire lock_ here, otherwise we could block and prevent an async
  // Cancel() from proceeding
  Status status = executor_->GetNext(batch);

  // if there was an error, we need to return the query's error status rather than
  // the status we just got back from the local executor (which may well be CANCELLED
  // in that case). The coordinator fragment failed in this case, so we log the
  // query_id.
  RETURN_IF_ERROR(UpdateStatus(status, &runtime_state()->fragment_instance_id()));

  if (*batch == NULL) {
    returned_all_results_ = true;
    if (executor_->ReachedLimit()) {
      // We've reached the query limit, cancel the remote fragments. The
      // Exchange node on our fragment is no longer receiving rows so the
      // remote fragments must be explicitly cancelled.
      CancelRemoteFragments();
      RuntimeState* state = runtime_state();
      if (state != NULL) {
        // Cancel the streams receiving batches. The exchange nodes that would
        // normally read from the streams are done.
        state->stream_mgr()->Cancel(state->fragment_instance_id());
      }
    }

    // Don't return the final NULL until all backends have completed.
    // GetNext must wait for all backends to complete before
    // ultimately signalling the end of execution via a NULL
    // batch. After NULL is returned, the coordinator may tear down
    // query state, and perform post-query finalization which might
    // depend on the reports from all backends.
    RETURN_IF_ERROR(WaitForAllBackends());
    if (query_status_.ok()) {
      // If the query completed successfully, report aggregate query profiles.
      ReportQuerySummary();
    }
  }
  return Status::OK;
}
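
// Typical driver loop (sketch only; the caller-side names are hypothetical, but the
// Exec()/Wait()/GetNext() sequence matches the contract described above):
//   RETURN_IF_ERROR(coord->Exec(schedule, &output_expr_ctxs));
//   RETURN_IF_ERROR(coord->Wait());  // blocks until rows (or all backends) are ready
//   RowBatch* batch;
//   do {
//     RETURN_IF_ERROR(coord->GetNext(&batch, coord->runtime_state()));
//     // ... consume batch ...
//   } while (batch != NULL);         // NULL batch signals end of execution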

void Coordinator::PrintBackendInfo() {
  for (int i = 0; i < backend_exec_states_.size(); ++i) {
    SummaryStats& acc =
        fragment_profiles_[backend_exec_states_[i]->fragment_idx].bytes_assigned;
    acc(backend_exec_states_[i]->total_split_size);
  }

  for (int i = (executor_.get() == NULL ? 0 : 1); i < fragment_profiles_.size(); ++i) {
    SummaryStats& acc = fragment_profiles_[i].bytes_assigned;
    double min = accumulators::min(acc);
    double max = accumulators::max(acc);
    double mean = accumulators::mean(acc);
    double stddev = sqrt(accumulators::variance(acc));
    stringstream ss;
    ss << " min: " << PrettyPrinter::Print(min, TUnit::BYTES)
       << ", max: " << PrettyPrinter::Print(max, TUnit::BYTES)
       << ", avg: " << PrettyPrinter::Print(mean, TUnit::BYTES)
       << ", stddev: " << PrettyPrinter::Print(stddev, TUnit::BYTES);
    fragment_profiles_[i].averaged_profile->AddInfoString("split sizes", ss.str());

    if (VLOG_FILE_IS_ON) {
      VLOG_FILE << "Byte split for fragment " << i << " " << ss.str();
      for (int j = 0; j < backend_exec_states_.size(); ++j) {
        BackendExecState* exec_state = backend_exec_states_[j];
        if (exec_state->fragment_idx != i) continue;
        VLOG_FILE << "data volume for ipaddress " << exec_state->backend_address << ": "
                  << PrettyPrinter::Print(
                      exec_state->total_split_size, TUnit::BYTES);
      }
    }
  }
}

void Coordinator::InitExecProfile(const TQueryExecRequest& request) {
  // Initialize the structure to collect execution summary of every plan node.
  exec_summary_.__isset.nodes = true;
  for (int i = 0; i < request.fragments.size(); ++i) {
    if (!request.fragments[i].__isset.plan) continue;
    const TPlan& plan = request.fragments[i].plan;
    int fragment_first_node_idx = exec_summary_.nodes.size();

    for (int j = 0; j < plan.nodes.size(); ++j) {
      TPlanNodeExecSummary node;
      node.node_id = plan.nodes[j].node_id;
      node.fragment_id = i;
      node.label = plan.nodes[j].label;
      node.__set_label_detail(plan.nodes[j].label_detail);
      node.num_children = plan.nodes[j].num_children;

      if (plan.nodes[j].__isset.estimated_stats) {
        node.__set_estimated_stats(plan.nodes[j].estimated_stats);
      }

      plan_node_id_to_summary_map_[plan.nodes[j].node_id] = exec_summary_.nodes.size();
      exec_summary_.nodes.push_back(node);
    }

    if (request.fragments[i].__isset.output_sink &&
        request.fragments[i].output_sink.type == TDataSinkType::DATA_STREAM_SINK) {
      const TDataStreamSink& sink = request.fragments[i].output_sink.stream_sink;
      int exch_idx = plan_node_id_to_summary_map_[sink.dest_node_id];
      if (sink.output_partition.type == TPartitionType::UNPARTITIONED) {
        exec_summary_.nodes[exch_idx].__set_is_broadcast(true);
      }
      exec_summary_.__isset.exch_to_sender_map = true;
      exec_summary_.exch_to_sender_map[exch_idx] = fragment_first_node_idx;
    }
  }

  if (executor_.get() != NULL) {
    // register coordinator's fragment profile now, before those of the backends,
    // so it shows up at the top
    query_profile_->AddChild(executor_->profile());
    executor_->profile()->set_name(Substitute("Coordinator Fragment $0",
        request.fragments[0].display_name));
    CollectScanNodeCounters(executor_->profile(), &coordinator_counters_);
  }

  // Initialize the runtime profile structure. This adds the per fragment average
  // profiles followed by the per fragment instance profiles.
  bool has_coordinator_fragment =
      request.fragments[0].partition.type == TPartitionType::UNPARTITIONED;
  fragment_profiles_.resize(request.fragments.size());
  for (int i = 0; i < request.fragments.size(); ++i) {
    fragment_profiles_[i].num_instances = 0;

    // Special case fragment idx 0 if there is a coordinator. There is only one
    // instance of this profile so the average is just the coordinator profile.
    if (i == 0 && has_coordinator_fragment) {
      fragment_profiles_[i].averaged_profile = executor_->profile();
      fragment_profiles_[i].num_instances = 1;
      continue;
    }
    fragment_profiles_[i].averaged_profile =
        obj_pool()->Add(new RuntimeProfile(obj_pool(),
            Substitute("Averaged Fragment $0", request.fragments[i].display_name),
            true));
    // Insert the avg profiles in ascending fragment number order. If
    // there is a coordinator fragment, it's been placed in
    // fragment_profiles_[0].averaged_profile, ensuring that this code
    // will put the first averaged profile immediately after it. If
    // there is no coordinator fragment, the first averaged profile
    // will be inserted as the first child of query_profile_, and then
    // all other averaged fragments will follow.
    query_profile_->AddChild(fragment_profiles_[i].averaged_profile, true,
        (i > 0) ? fragment_profiles_[i - 1].averaged_profile : NULL);

    fragment_profiles_[i].root_profile =
        obj_pool()->Add(new RuntimeProfile(obj_pool(),
            Substitute("Fragment $0", request.fragments[i].display_name)));
    // Note: we don't start the wall timer here for the fragment
    // profile; it's uninteresting and misleading.
    query_profile_->AddChild(fragment_profiles_[i].root_profile);
  }
}

void Coordinator::CollectScanNodeCounters(RuntimeProfile* profile,
    FragmentInstanceCounters* counters) {
  vector<RuntimeProfile*> children;
  profile->GetAllChildren(&children);
  for (int i = 0; i < children.size(); ++i) {
    RuntimeProfile* p = children[i];
    int id = ExecNode::GetNodeIdFromProfile(p);

    // This profile is not for an exec node.
    if (id == g_ImpalaInternalService_constants.INVALID_PLAN_NODE_ID) continue;

    RuntimeProfile::Counter* throughput_counter =
        p->GetCounter(ScanNode::TOTAL_THROUGHPUT_COUNTER);
    if (throughput_counter != NULL) {
      counters->throughput_counters[id] = throughput_counter;
    }
    RuntimeProfile::Counter* scan_ranges_counter =
        p->GetCounter(ScanNode::SCAN_RANGES_COMPLETE_COUNTER);
    if (scan_ranges_counter != NULL) {
      counters->scan_ranges_complete_counters[id] = scan_ranges_counter;
    }
  }
}

void Coordinator::CreateAggregateCounters(
    const vector<TPlanFragment>& fragments) {
  BOOST_FOREACH(const TPlanFragment& fragment, fragments) {
    if (!fragment.__isset.plan) continue;
    const vector<TPlanNode>& nodes = fragment.plan.nodes;
    BOOST_FOREACH(const TPlanNode& node, nodes) {
      if (node.node_type != TPlanNodeType::HDFS_SCAN_NODE
          && node.node_type != TPlanNodeType::HBASE_SCAN_NODE) {
        continue;
      }

      stringstream s;
      s << PrintPlanNodeType(node.node_type) << " (id="
        << node.node_id << ") Throughput";
      query_profile_->AddDerivedCounter(s.str(), TUnit::BYTES_PER_SECOND,
          bind<int64_t>(mem_fn(&Coordinator::ComputeTotalThroughput),
              this, node.node_id));
      s.str("");
      s << PrintPlanNodeType(node.node_type) << " (id="
        << node.node_id << ") Completed scan ranges";
      query_profile_->AddDerivedCounter(s.str(), TUnit::UNIT,
          bind<int64_t>(mem_fn(&Coordinator::ComputeTotalScanRangesComplete),
              this, node.node_id));
    }
  }
}

int64_t Coordinator::ComputeTotalThroughput(int node_id) {
  int64_t value = 0;
  for (int i = 0; i < backend_exec_states_.size(); ++i) {
    BackendExecState* exec_state = backend_exec_states_[i];
    value += exec_state->GetNodeThroughput(node_id);
  }
  // Add up the local fragment throughput counter
  CounterMap& throughput_counters = coordinator_counters_.throughput_counters;
  CounterMap::iterator it = throughput_counters.find(node_id);
  if (it != throughput_counters.end()) {
    value += it->second->value();
  }
  return value;
}

int64_t Coordinator::ComputeTotalScanRangesComplete(int node_id) {
  int64_t value = 0;
  for (int i = 0; i < backend_exec_states_.size(); ++i) {
    BackendExecState* exec_state = backend_exec_states_[i];
    value += exec_state->GetNumScanRangesCompleted(node_id);
  }
  // Add up the local fragment's completed scan ranges counter
  CounterMap& scan_ranges_complete = coordinator_counters_.scan_ranges_complete_counters;
  CounterMap::iterator it = scan_ranges_complete.find(node_id);
  if (it != scan_ranges_complete.end()) {
    value += it->second->value();
  }
  return value;
}

Status Coordinator::ExecRemoteFragment(void* exec_state_arg) {
  BackendExecState* exec_state = reinterpret_cast<BackendExecState*>(exec_state_arg);
  VLOG_FILE << "making rpc: ExecPlanFragment query_id=" << query_id_
            << " instance_id=" << exec_state->fragment_instance_id
            << " host=" << exec_state->backend_address;
  lock_guard<mutex> l(exec_state->lock);

  Status status;
  ImpalaInternalServiceConnection backend_client(
      exec_env_->impalad_client_cache(), exec_state->backend_address, &status);
  RETURN_IF_ERROR(status);

  TExecPlanFragmentResult thrift_result;
  Status rpc_status = backend_client.DoRpc(&ImpalaInternalServiceClient::ExecPlanFragment,
      exec_state->rpc_params, &thrift_result);
  if (!rpc_status.ok()) {
    stringstream msg;
    msg << "ExecPlanRequest rpc query_id=" << query_id_
        << " instance_id=" << exec_state->fragment_instance_id
        << " failed: " << rpc_status.msg().msg();
    VLOG_QUERY << msg.str();
    exec_state->status = Status(msg.str());
    return exec_state->status;
  }

  exec_state->status = thrift_result.status;
  if (exec_state->status.ok()) {
    exec_state->initiated = true;
    exec_state->stopwatch.Start();
  }
  return exec_state->status;
}

void Coordinator::Cancel(const Status* cause) {
  lock_guard<mutex> l(lock_);
  // if the query status indicates an error, cancellation has already been initiated
  if (!query_status_.ok()) return;
  // prevent others from cancelling a second time
  query_status_ = (cause != NULL && !cause->ok()) ? *cause : Status::CANCELLED;
  CancelInternal();
}

void Coordinator::CancelInternal() {
  VLOG_QUERY << "Cancel() query_id=" << query_id_;
  DCHECK(!query_status_.ok());

  // cancel local fragment
  if (executor_.get() != NULL) executor_->Cancel();

  CancelRemoteFragments();

  // Report the summary with whatever progress the query made before being cancelled.
  ReportQuerySummary();
}

void Coordinator::CancelRemoteFragments() {
  for (int i = 0; i < backend_exec_states_.size(); ++i) {
    BackendExecState* exec_state = backend_exec_states_[i];

    // If a fragment failed before we finished issuing all remote fragments,
    // this function will have been called before we finished populating
    // backend_exec_states_. Skip any such uninitialized exec states.
    if (exec_state == NULL) continue;

    // lock each exec_state individually to synchronize correctly with
    // UpdateFragmentExecStatus() (which doesn't get the global lock_
    // to set its status)
    lock_guard<mutex> l(exec_state->lock);

    // no need to cancel if we already know it terminated w/ an error status
    if (!exec_state->status.ok()) continue;

    // Nothing to cancel if the exec rpc was not sent
    if (!exec_state->initiated) continue;

    // don't cancel if it already finished
    if (exec_state->done) continue;

    // set an error status to make sure we only cancel this once
    exec_state->status = Status::CANCELLED;

    // if we get an error while trying to get a connection to the backend,
    // keep going
    Status status;
    ImpalaInternalServiceConnection backend_client(
        exec_env_->impalad_client_cache(), exec_state->backend_address, &status);
    if (!status.ok()) {
      continue;
    }

    TCancelPlanFragmentParams params;
    params.protocol_version = ImpalaInternalServiceVersion::V1;
    params.__set_fragment_instance_id(exec_state->fragment_instance_id);
    TCancelPlanFragmentResult res;
    VLOG_QUERY << "sending CancelPlanFragment rpc for instance_id="
               << exec_state->fragment_instance_id << " backend="
               << exec_state->backend_address;
    Status rpc_status = backend_client.DoRpc(
        &ImpalaInternalServiceClient::CancelPlanFragment, params, &res);
    if (!rpc_status.ok()) {
      exec_state->status.MergeStatus(rpc_status);
      stringstream msg;
      msg << "CancelPlanFragment rpc query_id=" << query_id_
          << " instance_id=" << exec_state->fragment_instance_id
          << " failed: " << rpc_status.msg().msg();
      // make a note of the error status, but keep on cancelling the other fragments
      exec_state->status.AddDetail(msg.str());
      continue;
    }
    if (res.status.status_code != TErrorCode::OK) {
      exec_state->status.AddDetail(join(res.status.error_msgs, "; "));
    }
  }

  // notify that we completed with an error
  backend_completion_cv_.notify_all();
}

Status Coordinator::UpdateFragmentExecStatus(const TReportExecStatusParams& params) {
  VLOG_FILE << "UpdateFragmentExecStatus() query_id=" << query_id_
            << " status=" << params.status.status_code
            << " done=" << (params.done ? "true" : "false");
  if (params.backend_num >= backend_exec_states_.size()) {
    return Status(TErrorCode::INTERNAL_ERROR, "unknown backend number");
  }
  BackendExecState* exec_state = backend_exec_states_[params.backend_num];

  const TRuntimeProfileTree& cumulative_profile = params.profile;
  Status status(params.status);
  {
    lock_guard<mutex> l(exec_state->lock);
    if (!status.ok()) {
      // During query cancellation, exec_state is set to CANCELLED. However, we might
      // process a non-error message from a fragment executor that is sent
      // before query cancellation is invoked. Make sure we don't go from error status
      // to OK.
      exec_state->status = status;
    }
    exec_state->done = params.done;
    if (exec_state->status.ok()) {
      // We can't update this backend's profile if ReportQuerySummary() is running,
      // because it depends on all profiles not changing during its execution (when it
      // calls SortChildren()). ReportQuerySummary() only gets called after
      // WaitForAllBackends() returns or at the end of CancelRemoteFragments().
      // WaitForAllBackends() only returns after all backends have completed (in which
      // case we wouldn't be in this function), or when there's an error, in which case
      // CancelRemoteFragments() is called. CancelRemoteFragments sets all exec_state's
      // statuses to cancelled.
      // TODO: We're losing this profile information. Call ReportQuerySummary only after
      // all backends have completed.
      exec_state->profile->Update(cumulative_profile);

      // Update the average profile for the fragment corresponding to this instance.
      exec_state->profile->ComputeTimeInProfile();
      UpdateAverageProfile(exec_state);
      UpdateExecSummary(exec_state->fragment_idx, exec_state->instance_idx,
          exec_state->profile);
    }
    if (!exec_state->profile_created) {
      CollectScanNodeCounters(exec_state->profile, &exec_state->aggregate_counters);
    }
    exec_state->profile_created = true;

    // Log messages aggregated by type
    if (params.__isset.error_log && params.error_log.size() > 0) {
      // Append the log messages from each update with the global state of the query
      // execution
      MergeErrorMaps(&exec_state->error_log, params.error_log);
      VLOG_FILE << "instance_id=" << exec_state->fragment_instance_id
                << " error log: " << PrintErrorMapToString(exec_state->error_log);
    }
    progress_.Update(exec_state->UpdateNumScanRangesCompleted());
  }

  if (params.done && params.__isset.insert_exec_status) {
    lock_guard<mutex> l(lock_);
    // Merge in table update data (partitions written to, files to be moved as part of
    // finalization)
    BOOST_FOREACH(const PartitionStatusMap::value_type& partition,
        params.insert_exec_status.per_partition_status) {
      TInsertPartitionStatus* status = &(per_partition_status_[partition.first]);
      status->num_appended_rows += partition.second.num_appended_rows;
      status->id = partition.second.id;
      if (!status->__isset.stats) status->__set_stats(TInsertStats());
      DataSink::MergeInsertStats(partition.second.stats, &status->stats);
    }
    files_to_move_.insert(
        params.insert_exec_status.files_to_move.begin(),
        params.insert_exec_status.files_to_move.end());
  }

  if (VLOG_FILE_IS_ON) {
    stringstream s;
    exec_state->profile->PrettyPrint(&s);
    VLOG_FILE << "profile for query_id=" << query_id_
              << " instance_id=" << exec_state->fragment_instance_id
              << "\n" << s.str();
  }
  // also print the cumulative profile
  // TODO: fix the coordinator/PlanFragmentExecutor, so this isn't needed
  if (VLOG_FILE_IS_ON) {
    stringstream s;
    query_profile_->PrettyPrint(&s);
    VLOG_FILE << "cumulative profile for query_id=" << query_id_
              << "\n" << s.str();
  }

  // for now, abort the query if we see any error except if the error is cancelled
  // and returned_all_results_ is true.
  // (UpdateStatus() initiates cancellation, if it hasn't already been initiated)
  if (!(returned_all_results_ && status.IsCancelled()) && !status.ok()) {
    UpdateStatus(status, &exec_state->fragment_instance_id);
    return Status::OK;
  }

  if (params.done) {
    lock_guard<mutex> l(lock_);
    exec_state->stopwatch.Stop();
    DCHECK_GT(num_remaining_backends_, 0);
    VLOG_QUERY << "Backend " << params.backend_num << " completed, "
               << num_remaining_backends_ - 1 << " remaining: query_id=" << query_id_;
    if (VLOG_QUERY_IS_ON && num_remaining_backends_ > 1) {
      // print host/port info for the first backend that's still in progress as a
      // debugging aid for backend deadlocks
      for (int i = 0; i < backend_exec_states_.size(); ++i) {
        BackendExecState* exec_state = backend_exec_states_[i];
        lock_guard<mutex> l2(exec_state->lock);
        if (!exec_state->done) {
          VLOG_QUERY << "query_id=" << query_id_ << ": first in-progress backend: "
                     << exec_state->backend_address;
          break;
        }
      }
    }
    if (--num_remaining_backends_ == 0) {
      backend_completion_cv_.notify_all();
    }
  }

  return Status::OK;
}

const RowDescriptor& Coordinator::row_desc() const {
  DCHECK(executor_.get() != NULL);
  return executor_->row_desc();
}

RuntimeState* Coordinator::runtime_state() {
  return executor_.get() == NULL ? NULL : executor_->runtime_state();
}

MemTracker* Coordinator::query_mem_tracker() {
  return executor_.get() == NULL ? query_mem_tracker_.get() :
      executor_->runtime_state()->query_mem_tracker();
}

bool Coordinator::PrepareCatalogUpdate(TUpdateCatalogRequest* catalog_update) {
  // Assume we are called only after all fragments have completed
  DCHECK(has_called_wait_);

  BOOST_FOREACH(const PartitionStatusMap::value_type& partition, per_partition_status_) {
    catalog_update->created_partitions.insert(partition.first);
  }

  return catalog_update->created_partitions.size() != 0;
}

// Comparator to order fragments by descending total time
typedef struct {
  typedef pair<RuntimeProfile*, bool> Profile;
  bool operator()(const Profile& a, const Profile& b) const {
    // Reverse ordering: we want the longest first
    return
        a.first->total_time_counter()->value() > b.first->total_time_counter()->value();
  }
} InstanceComparator;
1340 
1341 // Update fragment average profile information from a backend execution state.
1342 void Coordinator::UpdateAverageProfile(BackendExecState* backend_exec_state) {
1343  int fragment_idx = backend_exec_state->fragment_idx;
1344  DCHECK_GE(fragment_idx, 0);
1345  DCHECK_LT(fragment_idx, fragment_profiles_.size());
1346  PerFragmentProfileData& data = fragment_profiles_[fragment_idx];
1347 
1348  // No locks are taken since UpdateAverage() and AddChild() take their own locks
1349  data.averaged_profile->UpdateAverage(backend_exec_state->profile);
1350  data.root_profile->AddChild(backend_exec_state->profile);
1351 }
1352 
1353 // Compute fragment summary information from a backend execution state.
1354 void Coordinator::ComputeFragmentSummaryStats(BackendExecState* backend_exec_state) {
1355  int fragment_idx = backend_exec_state->fragment_idx;
1356  DCHECK_GE(fragment_idx, 0);
1357  DCHECK_LT(fragment_idx, fragment_profiles_.size());
1358  PerFragmentProfileData& data = fragment_profiles_[fragment_idx];
1359 
1360  int64_t completion_time = backend_exec_state->stopwatch.ElapsedTime();
1361  data.completion_times(completion_time);
1362  data.rates(backend_exec_state->total_split_size / (completion_time / 1000.0
1363  / 1000.0 / 1000.0));
1364 
1365  // Add the child in case it has not been added previously
1366  // via UpdateAverageProfile(). AddChild() will do nothing if the child
1367  // already exists.
1368  data.root_profile->AddChild(backend_exec_state->profile);
1369 }
1370 
1371 void Coordinator::UpdateExecSummary(int fragment_idx, int instance_idx,
1372  RuntimeProfile* profile) {
1373  vector<RuntimeProfile*> children;
1374  profile->GetAllChildren(&children);
1375 
1376  lock_guard<SpinLock> l(exec_summary_lock_);
1377  for (int i = 0; i < children.size(); ++i) {
1378  int id = ExecNode::GetNodeIdFromProfile(children[i]);
1379  if (id == -1) continue;
1380 
1381  TPlanNodeExecSummary& exec_summary =
1383  if (exec_summary.exec_stats.empty()) {
1384  // First time, make an exec_stats for each instance this plan node is running on.
1385  DCHECK_LT(fragment_idx, fragment_profiles_.size());
1386  exec_summary.exec_stats.resize(fragment_profiles_[fragment_idx].num_instances);
1387  }
1388  DCHECK_LT(instance_idx, exec_summary.exec_stats.size());
1389  TExecStats& stats = exec_summary.exec_stats[instance_idx];
1390 
1391  RuntimeProfile::Counter* rows_counter = children[i]->GetCounter("RowsReturned");
1392  RuntimeProfile::Counter* mem_counter = children[i]->GetCounter("PeakMemoryUsage");
1393  if (rows_counter != NULL) stats.__set_cardinality(rows_counter->value());
1394  if (mem_counter != NULL) stats.__set_memory_used(mem_counter->value());
1395  stats.__set_latency_ns(children[i]->local_time());
1396  // TODO: we don't track CPU time per node yet. Add that.
1397  exec_summary.__isset.exec_stats = true;
1398  }
1399  VLOG(2) << PrintExecSummary(exec_summary_);
1400 }
1401 
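// [Editor's sketch, standalone -- not part of coordinator.cc] The indirection
// used above, in miniature: plan node ids are sparse, so a map translates each
// id into a slot of the summary vector, and the per-instance stats vector is
// sized lazily on the first update. All types here are simplified stand-ins.
#include <map>
#include <vector>

struct ExecStatsStub {
  long cardinality;
  long memory_used;
  ExecStatsStub() : cardinality(0), memory_used(0) {}
};
struct NodeSummaryStub { std::vector<ExecStatsStub> exec_stats; };

void UpdateNodeStats(std::map<int, int>& id_to_index,
    std::vector<NodeSummaryStub>& nodes, int node_id, int instance_idx,
    int num_instances, long rows) {
  NodeSummaryStub& summary = nodes[id_to_index[node_id]];
  if (summary.exec_stats.empty()) {
    // First update for this node: one stats slot per fragment instance.
    summary.exec_stats.resize(num_instances);
  }
  summary.exec_stats[instance_idx].cardinality = rows;
}
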
1402 // This function appends summary information to the query_profile_ before
1403 // outputting it to VLOG. It adds:
1404 // 1. Averaged remote fragment profiles (TODO: add outliers)
1405 // 2. Summary of remote fragment durations (min, max, mean, stddev)
1406 // 3. Summary of remote fragment rates (min, max, mean, stddev)
1407 // TODO: add histogram/percentile
1408 void Coordinator::ReportQuerySummary() {
1409  // If Wait() was never called, the query did not even get to start on all the
1410  // remote nodes and some of the state used below might be uninitialized; the
1411  // query has made so little progress that reporting a summary is not useful.
1412  if (!has_called_wait_) return;
1413 
1414  // The fragment has finished executing. Update the profile to compute the
1415  // fraction of time spent in each node.
1416  if (executor_.get() != NULL) {
1417  executor_->profile()->ComputeTimeInProfile();
1418  UpdateExecSummary(0, 0, executor_->profile());
1419  }
1420 
1421  if (!backend_exec_states_.empty()) {
1422  // Average all remote fragments for each fragment.
1423  for (int i = 0; i < backend_exec_states_.size(); ++i) {
1424  backend_exec_states_[i]->profile->ComputeTimeInProfile();
1425  UpdateAverageProfile(backend_exec_states_[i]);
1426  ComputeFragmentSummaryStats(backend_exec_states_[i]);
1427  UpdateExecSummary(backend_exec_states_[i]->fragment_idx,
1428  backend_exec_states_[i]->instance_idx, backend_exec_states_[i]->profile);
1429  }
1430 
1431  InstanceComparator comparator;
1432  // Per fragment instances have been collected, output summaries
1433  for (int i = (executor_.get() != NULL ? 1 : 0); i < fragment_profiles_.size(); ++i) {
1434  fragment_profiles_[i].root_profile->SortChildren(comparator);
1435  SummaryStats& completion_times = fragment_profiles_[i].completion_times;
1436  SummaryStats& rates = fragment_profiles_[i].rates;
1437 
1438  stringstream times_label;
1439  times_label
1440  << "min:" << PrettyPrinter::Print(
1441  accumulators::min(completion_times), TUnit::TIME_NS)
1442  << " max:" << PrettyPrinter::Print(
1443  accumulators::max(completion_times), TUnit::TIME_NS)
1444  << " mean: " << PrettyPrinter::Print(
1445  accumulators::mean(completion_times), TUnit::TIME_NS)
1446  << " stddev:" << PrettyPrinter::Print(
1447  sqrt(accumulators::variance(completion_times)), TUnit::TIME_NS);
1448 
1449  stringstream rates_label;
1450  rates_label
1451  << "min:" << PrettyPrinter::Print(
1452  accumulators::min(rates), TUnit::BYTES_PER_SECOND)
1453  << " max:" << PrettyPrinter::Print(
1454  accumulators::max(rates), TUnit::BYTES_PER_SECOND)
1455  << " mean:" << PrettyPrinter::Print(
1456  accumulators::mean(rates), TUnit::BYTES_PER_SECOND)
1457  << " stddev:" << PrettyPrinter::Print(
1458  sqrt(accumulators::variance(rates)), TUnit::BYTES_PER_SECOND);
1459 
1460  fragment_profiles_[i].averaged_profile->AddInfoString(
1461  "completion times", times_label.str());
1462  fragment_profiles_[i].averaged_profile->AddInfoString(
1463  "execution rates", rates_label.str());
1464  fragment_profiles_[i].averaged_profile->AddInfoString(
1465  "num instances", lexical_cast<string>(fragment_profiles_[i].num_instances));
1466  }
1467 
1468  // Add per node peak memory usage as InfoString
1469  // Map from Impalad address to peak memory usage of this query
1470  typedef boost::unordered_map<TNetworkAddress, int64_t> PerNodePeakMemoryUsage;
1471  PerNodePeakMemoryUsage per_node_peak_mem_usage;
1472  if (executor_.get() != NULL) {
1473  // Coordinator fragment is not included in backend_exec_states_.
1474  RuntimeProfile::Counter* mem_usage_counter =
1475  executor_->profile()->GetCounter(
1476  PER_HOST_PEAK_MEM_COUNTER);
1477  if (mem_usage_counter != NULL) {
1478  TNetworkAddress coord = MakeNetworkAddress(FLAGS_hostname, FLAGS_be_port);
1479  per_node_peak_mem_usage[coord] = mem_usage_counter->value();
1480  }
1481  }
1482  for (int i = 0; i < backend_exec_states_.size(); ++i) {
1483  int64_t initial_usage = 0;
1484  int64_t* mem_usage = FindOrInsert(&per_node_peak_mem_usage,
1485  backend_exec_states_[i]->backend_address, initial_usage);
1486  RuntimeProfile::Counter* mem_usage_counter =
1487  backend_exec_states_[i]->profile->GetCounter(
1488  PER_HOST_PEAK_MEM_COUNTER);
1489  if (mem_usage_counter != NULL && mem_usage_counter->value() > *mem_usage) {
1490  per_node_peak_mem_usage[backend_exec_states_[i]->backend_address] =
1491  mem_usage_counter->value();
1492  }
1493  }
1494  stringstream info;
1495  BOOST_FOREACH(PerNodePeakMemoryUsage::value_type entry, per_node_peak_mem_usage) {
1496  info << entry.first << "("
1497  << PrettyPrinter::Print(entry.second, TUnit::BYTES) << ") ";
1498  }
1499  query_profile_->AddInfoString("Per Node Peak Memory Usage", info.str());
1500  }
1501 }
1502 
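// [Editor's sketch, standalone -- not part of coordinator.cc] SummaryStats
// (coordinator.h) is a boost::accumulators accumulator_set with min, max, mean
// and variance features; this is the pattern the summary labels above rely on.
// operator() adds a sample, extractors read the statistics, and stddev is
// obtained as sqrt(variance).
#include <stdint.h>
#include <cmath>
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <boost/accumulators/statistics/min.hpp>
#include <boost/accumulators/statistics/max.hpp>
#include <boost/accumulators/statistics/mean.hpp>
#include <boost/accumulators/statistics/variance.hpp>

namespace acc = boost::accumulators;

typedef acc::accumulator_set<int64_t, acc::features<acc::tag::min, acc::tag::max,
    acc::tag::mean, acc::tag::variance> > SummaryStatsStub;

void SummaryStatsDemo() {
  SummaryStatsStub completion_times;
  completion_times(1000000000LL);  // one instance finished in 1s (sample in ns)
  completion_times(3000000000LL);  // another finished in 3s
  double min_ns = acc::min(completion_times);                     // 1e9
  double mean_ns = acc::mean(completion_times);                   // 2e9
  double stddev_ns = std::sqrt(acc::variance(completion_times));  // 1e9
  (void)min_ns; (void)mean_ns; (void)stddev_ns;
}
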
1503 string Coordinator::GetErrorLog() {
1504  ErrorLogMap merged;
1505  {
1506  lock_guard<mutex> l(lock_);
1507  if (executor_.get() != NULL && executor_->runtime_state() != NULL &&
1508  !executor_->runtime_state()->ErrorLogIsEmpty()) {
1509  MergeErrorMaps(&merged, executor_->runtime_state()->error_log());
1510  }
1511  }
1512  for (int i = 0; i < backend_exec_states_.size(); ++i) {
1513  lock_guard<mutex> l(backend_exec_states_[i]->lock);
1514  if (backend_exec_states_[i]->error_log.size() > 0) {
1515  MergeErrorMaps(&merged, backend_exec_states_[i]->error_log);
1516  }
1517  }
1518  return PrintErrorMapToString(merged);
1519 }
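
// [Editor's sketch, standalone -- not part of coordinator.cc] GetErrorLog()
// folds the coordinator's and every backend's ErrorLogMap into one map before
// printing. The merge semantics are illustrated here with simplified types,
// assuming each error code maps to an occurrence count:
#include <map>

typedef std::map<int, int> ErrorCountMap;  // error code -> occurrence count

void MergeErrorCounts(ErrorCountMap* left, const ErrorCountMap& right) {
  for (ErrorCountMap::const_iterator it = right.begin(); it != right.end(); ++it) {
    (*left)[it->first] += it->second;  // aggregate counts across backends
  }
}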
1520 
1521 void Coordinator::SetExecPlanFragmentParams(
1522  QuerySchedule& schedule, int backend_num, const TPlanFragment& fragment,
1523  int fragment_idx, const FragmentExecParams& params, int instance_idx,
1524  const TNetworkAddress& coord, TExecPlanFragmentParams* rpc_params) {
1525  rpc_params->__set_protocol_version(ImpalaInternalServiceVersion::V1);
1526  rpc_params->__set_fragment(fragment);
1527  rpc_params->__set_desc_tbl(desc_tbl_);
1528  TNetworkAddress exec_host = params.hosts[instance_idx];
1529  if (schedule.HasReservation()) {
1530  // The reservation has already been validated at this point.
1531  TNetworkAddress resource_hostport;
1532  schedule.GetResourceHostport(exec_host, &resource_hostport);
1533  map<TNetworkAddress, llama::TAllocatedResource>::const_iterator it =
1534  schedule.reservation()->allocated_resources.find(resource_hostport);
1535  // Only set the reserved resource if we actually have one for this plan
1536  // fragment. Otherwise, don't set it (usually this is the coordinator
1537  // fragment), and it won't participate in dynamic RM controls.
1538  if (it != schedule.reservation()->allocated_resources.end()) {
1539  rpc_params->__set_reserved_resource(it->second);
1540  rpc_params->__set_local_resource_address(resource_hostport);
1541  }
1542  }
1543  rpc_params->params.__set_request_pool(schedule.request_pool());
1544  FragmentScanRangeAssignment::const_iterator it =
1545  params.scan_range_assignment.find(exec_host);
1546  // Scan ranges may not always be set, so use an empty structure if none are assigned.
1547  const PerNodeScanRanges& scan_ranges =
1548  (it != params.scan_range_assignment.end()) ? it->second : PerNodeScanRanges();
1549 
1550  rpc_params->params.__set_per_node_scan_ranges(scan_ranges);
1551  rpc_params->params.__set_per_exch_num_senders(params.per_exch_num_senders);
1552  rpc_params->params.__set_destinations(params.destinations);
1553  rpc_params->params.__set_sender_id(params.sender_id_base + instance_idx);
1554  rpc_params->__isset.params = true;
1555  rpc_params->fragment_instance_ctx.__set_query_ctx(query_ctx_);
1556  rpc_params->fragment_instance_ctx.fragment_instance_id =
1557  params.instance_ids[instance_idx];
1558  rpc_params->fragment_instance_ctx.fragment_instance_idx = instance_idx;
1559  rpc_params->fragment_instance_ctx.num_fragment_instances = params.instance_ids.size();
1560  rpc_params->fragment_instance_ctx.backend_num = backend_num;
1561  rpc_params->__isset.fragment_instance_ctx = true;
1562 }
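
// [Editor's sketch, standalone -- not part of coordinator.cc] The scan-range
// lookup above uses a find-or-default pattern: a host with no entry in the
// assignment map is sent an empty PerNodeScanRanges rather than treated as an
// error. In miniature, with simplified stand-in types:
#include <map>
#include <string>
#include <vector>

typedef std::vector<int> ScanRangesStub;  // stands in for PerNodeScanRanges
typedef std::map<std::string, ScanRangesStub> AssignmentStub;

ScanRangesStub RangesForHost(const AssignmentStub& assignment, const std::string& host) {
  AssignmentStub::const_iterator it = assignment.find(host);
  // Hosts without assigned scan ranges get an empty structure, not an error.
  return (it != assignment.end()) ? it->second : ScanRangesStub();
}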
1563 
1564 }