Impala
Impala is the open source, native analytic database for Apache Hadoop.
query-resource-mgr.cc
// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/query-resource-mgr.h"

#include <boost/foreach.hpp>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <gutil/strings/substitute.h>
#include <sstream>

#include "runtime/exec-env.h"
#include "resourcebroker/resource-broker.h"
#include "util/container-util.h"
#include "util/network-util.h"
#include "util/promise.h"
#include "util/time.h"

#include "common/names.h"

using boost::uuids::random_generator;
using boost::uuids::uuid;
using namespace impala;
using namespace strings;

const int64_t DEFAULT_EXPANSION_REQUEST_TIMEOUT_MS = 5000;

DEFINE_double(max_vcore_oversubscription_ratio, 2.5, "(Advanced) The maximum ratio "
    "allowed between running threads and acquired VCore resources for a query's fragments"
    " on a single node");

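// Populates the Impalad-to-datanode address mapping when Impala runs against a
// pseudo-distributed Llama; otherwise no mapping is required (see GetResourceHostport()).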
ResourceResolver::ResourceResolver(const unordered_set<TNetworkAddress>& unique_hosts) {
  if (ExecEnv::GetInstance()->is_pseudo_distributed_llama()) {
    CreateLocalLlamaNodeMapping(unique_hosts);
  }
}

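// Returns in 'dest' the address that resource requests for 'src' should be directed to:
// the mapped local datanode in pseudo-distributed Llama mode, otherwise the Impalad's
// hostname with port 0.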
void ResourceResolver::GetResourceHostport(const TNetworkAddress& src,
    TNetworkAddress* dest) {
  if (ExecEnv::GetInstance()->is_pseudo_distributed_llama()) {
    *dest = impalad_to_dn_[src];
  } else {
    dest->hostname = src.hostname;
    dest->port = 0;
  }
}

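// Builds the bidirectional mapping between unique Impalad hosts and the Llama's
// registered datanodes, assigning datanodes round-robin.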
void ResourceResolver::CreateLocalLlamaNodeMapping(
    const unordered_set<TNetworkAddress>& unique_hosts) {
  DCHECK(ExecEnv::GetInstance()->is_pseudo_distributed_llama());
  const vector<string>& llama_nodes =
      ExecEnv::GetInstance()->resource_broker()->llama_nodes();
  DCHECK(!llama_nodes.empty());
  int llama_node_ix = 0;
  BOOST_FOREACH(const TNetworkAddress& host, unique_hosts) {
    TNetworkAddress dn_hostport = MakeNetworkAddress(llama_nodes[llama_node_ix]);
    impalad_to_dn_[host] = dn_hostport;
    dn_to_impalad_[dn_hostport] = host;
    LOG(INFO) << "Mapping Datanode " << dn_hostport << " to Impalad: " << host;
    // Round robin the registered Llama nodes.
    llama_node_ix = (llama_node_ix + 1) % llama_nodes.size();
  }
}

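// The constructor only records the reservation, query ID and local resource location;
// VCore acquisition does not start until InitVcoreAcquisition() is called.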
QueryResourceMgr::QueryResourceMgr(const TUniqueId& reservation_id,
    const TNetworkAddress& local_resource_location, const TUniqueId& query_id)
    : reservation_id_(reservation_id), query_id_(query_id),
      local_resource_location_(local_resource_location), exit_(false), callback_count_(0),
      threads_running_(0), vcores_(0) {
  max_vcore_oversubscription_ratio_ = FLAGS_max_vcore_oversubscription_ratio;
}

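// Starts the background thread that acquires additional VCores for this query on demand.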
void QueryResourceMgr::InitVcoreAcquisition(int32_t init_vcores) {
  LOG(INFO) << "Initialising vcore acquisition thread for query " << PrintId(query_id_)
            << " (" << init_vcores << " initial vcores)";
  DCHECK(acquire_vcore_thread_.get() == NULL)
      << "Double initialisation of QueryResourceMgr::InitVcoreAcquisition()";
  vcores_ = init_vcores;

  // These shared pointers to atomic values are used to communicate between the vcore
  // acquisition thread and the class destructor. If the acquisition thread is in the
  // middle of an Expand() call, the destructor might have to wait 5s (the default
  // timeout) to return, which holds up query close operations. So instead the destructor
  // checks whether the thread is in Expand(); if so it sets a synchronised flag,
  // early_exit_, which the thread inspects immediately after exiting Expand() and, if
  // set, exits before touching any of the class-wide state (because the destructor may
  // have finished before this point).
  thread_in_expand_.reset(new AtomicInt<int16_t>());
  early_exit_.reset(new AtomicInt<int16_t>());
  acquire_vcore_thread_.reset(
      new Thread("resource-mgmt", Substitute("acquire-cpu-$0", PrintId(query_id_)),
          bind<void>(mem_fn(&QueryResourceMgr::AcquireVcoreResources), this,
              thread_in_expand_, early_exit_)));
}

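// Fills in 'request' with an expansion of this query's existing reservation, asking for
// 'memory_mb' MB of memory and 'vcores' VCores at the resolved local resource address.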
Status QueryResourceMgr::CreateExpansionRequest(int64_t memory_mb, int64_t vcores,
    TResourceBrokerExpansionRequest* request) {
  DCHECK(request != NULL);
  DCHECK(memory_mb > 0 || vcores > 0);
  DCHECK(reservation_id_ != TUniqueId()) << "Expansion requires existing reservation";

  unordered_set<TNetworkAddress> hosts;
  hosts.insert(local_resource_location_);
  ResourceResolver resolver(hosts);
  llama::TResource res;
  res.memory_mb = memory_mb;
  res.v_cpu_cores = vcores;
  TNetworkAddress res_address;
  resolver.GetResourceHostport(local_resource_location_, &res_address);
  res.__set_askedLocation(TNetworkAddressToString(res_address));

  random_generator uuid_generator;
  uuid id = uuid_generator();
  res.client_resource_id.hi = *reinterpret_cast<uint64_t*>(&id.data[0]);
  res.client_resource_id.lo = *reinterpret_cast<uint64_t*>(&id.data[8]);
  res.enforcement = llama::TLocationEnforcement::MUST;

  request->__set_resource(res);
  request->__set_reservation_id(reservation_id_);
  request->__set_request_timeout(DEFAULT_EXPANSION_REQUEST_TIMEOUT_MS);

  return Status::OK;
}

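// Returns true if more threads are running for this query than the acquired VCores
// allow, given the maximum oversubscription ratio.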
bool QueryResourceMgr::AboveVcoreSubscriptionThreshold() {
  return threads_running_ > vcores_ * max_vcore_oversubscription_ratio_;
}

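// Adjusts the number of threads known to be running for this query and wakes the VCore
// acquisition thread so it can re-check the oversubscription threshold.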
void QueryResourceMgr::NotifyThreadUsageChange(int delta) {
  lock_guard<mutex> l(threads_running_lock_);
  threads_running_ += delta;
  DCHECK(threads_running_ >= 0L);
  threads_changed_cv_.notify_all();
}

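// Registers a callback to run when a new VCore is acquired. Returns an ID that can be
// passed to RemoveVcoreAvailableCb() to unregister it.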
int32_t QueryResourceMgr::AddVcoreAvailableCb(const VcoreAvailableCb& callback) {
  lock_guard<mutex> l(callbacks_lock_);
  callbacks_[callback_count_] = callback;
  callbacks_it_ = callbacks_.begin();
  return callback_count_++;
}

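// Removes the callback with the given ID.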
void QueryResourceMgr::RemoveVcoreAvailableCb(int32_t callback_id) {
  lock_guard<mutex> l(callbacks_lock_);
  CallbackMap::iterator it = callbacks_.find(callback_id);
  DCHECK(it != callbacks_.end()) << "Could not find callback with id: " << callback_id;
  callbacks_.erase(it);
  callbacks_it_ = callbacks_.begin();
}

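// Body of the VCore acquisition thread: waits until the query is oversubscribed, then
// asks the resource broker to expand the reservation by one VCore, updates the query's
// cgroup CPU shares, and notifies one registered callback.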
void QueryResourceMgr::AcquireVcoreResources(
    shared_ptr<AtomicInt<int16_t> > thread_in_expand,
    shared_ptr<AtomicInt<int16_t> > early_exit) {
  // Take a copy because we'd like to print it in some cases after the destructor has run.
  TUniqueId reservation_id = reservation_id_;
  VLOG_QUERY << "Starting Vcore acquisition for: " << reservation_id;
  while (!ShouldExit()) {
    {
      unique_lock<mutex> l(threads_running_lock_);
      while (!AboveVcoreSubscriptionThreshold() && !ShouldExit()) {
        threads_changed_cv_.wait(l);
      }
    }
    if (ShouldExit()) break;

    TResourceBrokerExpansionRequest request;
    CreateExpansionRequest(0L, 1, &request);
    TResourceBrokerExpansionResponse response;
    VLOG_QUERY << "Expanding VCore allocation: " << reservation_id_;

    // First signal that we are about to enter a blocking Expand() call.
    thread_in_expand->FetchAndUpdate(1L);
    // TODO: Could cause problems if called during or after a system-wide shutdown
    Status status = ExecEnv::GetInstance()->resource_broker()->Expand(request, &response);
    thread_in_expand->FetchAndUpdate(-1L);
    // If signalled to exit quickly by the destructor, exit the loop now. It's important
    // to do so without accessing any class variables since they may no longer be valid.
    if (early_exit->FetchAndUpdate(0L) != 0) {
      VLOG_QUERY << "Fragment finished during Expand(): " << reservation_id;
      break;
    }
    if (!status.ok()) {
      VLOG_QUERY << "Could not expand CPU resources for query " << PrintId(query_id_)
                 << ", reservation: " << PrintId(reservation_id_) << ". Error was: "
                 << status.GetDetail();
      // Sleep to avoid flooding the resource broker, particularly if requests are being
      // rejected quickly (and therefore we stay oversubscribed).
      // TODO: configurable timeout
      SleepForMs(250);
      continue;
    }

    const llama::TAllocatedResource& resource =
        response.allocated_resources.begin()->second;
    DCHECK(resource.v_cpu_cores == 1)
        << "Asked for 1 core, got: " << resource.v_cpu_cores;
    vcores_ += resource.v_cpu_cores;

    ExecEnv* exec_env = ExecEnv::GetInstance();
    const string& cgroup =
        exec_env->cgroups_mgr()->UniqueIdToCgroup(PrintId(query_id_, "_"));
    int32_t num_shares = exec_env->cgroups_mgr()->VirtualCoresToCpuShares(vcores_);
    exec_env->cgroups_mgr()->SetCpuShares(cgroup, num_shares);

    // TODO: Only call one callback no matter how many VCores we just added; maybe call
    // all of them?
    {
      lock_guard<mutex> l(callbacks_lock_);
      if (callbacks_.size() != 0) {
        callbacks_it_->second();
        if (++callbacks_it_ == callbacks_.end()) callbacks_it_ = callbacks_.begin();
      }
    }
  }
  VLOG_QUERY << "Leaving VCore acquisition thread: " << reservation_id;
}

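// Returns true once Shutdown() has been called; checked by the acquisition thread to
// decide when to terminate.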
bool QueryResourceMgr::ShouldExit() {
  lock_guard<mutex> l(exit_lock_);
  return exit_;
}

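// Signals the acquisition thread to terminate and clears any registered callbacks. Does
// not block.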
void QueryResourceMgr::Shutdown() {
  {
    lock_guard<mutex> l(exit_lock_);
    if (exit_) return;
    exit_ = true;
  }
  {
    lock_guard<mutex> l(callbacks_lock_);
    callbacks_.clear();
  }
  threads_changed_cv_.notify_all();

  // Delete all non-reservation requests associated with this reservation ID. If this is
  // the coordinator, the SimpleScheduler will actually release the resources by
  // releasing the original reservation ID.
  ExecEnv::GetInstance()->resource_broker()->ClearRequests(reservation_id_, false);
}

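// Waits for the VCore acquisition thread to stop, unless it is blocked inside Expand(),
// in which case it is signalled to exit early and is not joined.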
QueryResourceMgr::~QueryResourceMgr() {
  if (acquire_vcore_thread_.get() == NULL) return;
  if (!ShouldExit()) Shutdown();
  // First, set the early exit flag. Then check to see if the thread is in Expand(). If
  // so, the acquisition thread is guaranteed to see early_exit_ == 1L once it finishes
  // Expand(), and will exit immediately. It's therefore safe not to wait for it.
  early_exit_->FetchAndUpdate(1L);
  if (thread_in_expand_->FetchAndUpdate(0L) == 0L) {
    acquire_vcore_thread_->Join();
  }
}