Impala
Impala is the open source, native analytic database for Apache Hadoop.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
impala::SimpleScheduler Class Reference

#include <simple-scheduler.h>

Inheritance diagram for impala::SimpleScheduler:
Collaboration diagram for impala::SimpleScheduler:

Public Types

typedef std::vector
< TBackendDescriptor > 
BackendList
 List of server descriptors. More...
 

Public Member Functions

 SimpleScheduler (StatestoreSubscriber *subscriber, const std::string &backend_id, const TNetworkAddress &backend_address, MetricGroup *metrics, Webserver *webserver, ResourceBroker *resource_broker, RequestPoolService *request_pool_service)
 
 SimpleScheduler (const std::vector< TNetworkAddress > &backends, MetricGroup *metrics, Webserver *webserver, ResourceBroker *resource_broker, RequestPoolService *request_pool_service)
 
virtual Status GetBackends (const std::vector< TNetworkAddress > &data_locations, BackendList *backends)
 
virtual impala::Status GetBackend (const TNetworkAddress &data_location, TBackendDescriptor *backend)
 
virtual void GetAllKnownBackends (BackendList *backends)
 Return a list of all backends known to the scheduler. More...
 
virtual bool HasLocalBackend (const TNetworkAddress &data_location)
 Return true if there is a backend located on the given data_location. More...
 
virtual impala::Status Init ()
 Registers with the subscription manager if required. More...
 
virtual Status Schedule (Coordinator *coord, QuerySchedule *schedule)
 
virtual Status Release (QuerySchedule *schedule)
 Releases the reserved resources (if any) from the given schedule. More...
 
virtual void HandlePreemptedReservation (const TUniqueId &reservation_id)
 
virtual void HandlePreemptedResource (const TUniqueId &client_resource_id)
 
virtual void HandleLostResource (const TUniqueId &client_resource_id)
 

Static Public Attributes

static const std::string IMPALA_MEMBERSHIP_TOPIC
 

Private Types

typedef boost::unordered_map
< std::string, std::list
< TBackendDescriptor > > 
BackendMap
 Map from a datanode's IP address to a list of backend addresses running on that node. More...
 
typedef boost::unordered_map
< std::string, std::string > 
BackendIpAddressMap
 
typedef boost::unordered_map
< std::string,
TBackendDescriptor > 
BackendIdMap
 
typedef boost::unordered_map
< TUniqueId, Coordinator * > 
ActiveReservationsMap
 
typedef boost::unordered_map
< TUniqueId, Coordinator * > 
ActiveClientResourcesMap
 

Private Member Functions

void AddToActiveResourceMaps (const TResourceBrokerReservationResponse &reservation, Coordinator *coord)
 
void RemoveFromActiveResourceMaps (const TResourceBrokerReservationResponse &reservation)
 
void UpdateMembership (const StatestoreSubscriber::TopicDeltaMap &incoming_topic_deltas, std::vector< TTopicDelta > *subscriber_topic_updates)
 Called asynchronously when an update is received from the subscription manager. More...
 
void BackendsUrlCallback (const Webserver::ArgumentMap &args, rapidjson::Document *document)
 
Status GetRequestPool (const std::string &user, const TQueryOptions &query_options, std::string *pool) const
 Determines the pool for a user and query options via request_pool_service_. More...
 
Status ComputeScanRangeAssignment (const TQueryExecRequest &exec_request, QuerySchedule *schedule)
 
Status ComputeScanRangeAssignment (PlanNodeId node_id, const std::vector< TScanRangeLocations > &locations, const std::vector< TNetworkAddress > &host_list, bool exec_at_coord, const TQueryOptions &query_options, FragmentScanRangeAssignment *assignment)
 
void ComputeFragmentExecParams (const TQueryExecRequest &exec_request, QuerySchedule *schedule)
 Populates fragment_exec_params_ in schedule. More...
 
void ComputeFragmentHosts (const TQueryExecRequest &exec_request, QuerySchedule *schedule)
 
PlanNodeId FindLeftmostNode (const TPlan &plan, const std::vector< TPlanNodeType::type > &types)
 
int FindLeftmostInputFragment (int fragment_idx, const TQueryExecRequest &exec_request)
 
void GetScanHosts (TPlanNodeId scan_id, const TQueryExecRequest &exec_request, const FragmentExecParams &params, std::vector< TNetworkAddress > *scan_hosts)
 Adds all hosts the given scan is executed on to scan_hosts. More...
 
bool ContainsNode (const TPlan &plan, TPlanNodeType::type type)
 Returns true if 'plan' contains a node of the given type. More...
 
void FindNodes (const TPlan &plan, const std::vector< TPlanNodeType::type > &types, std::vector< TPlanNodeId > *results)
 Returns all ids of nodes in 'plan' of any of the given types. More...
 
int FindSenderFragment (TPlanNodeId exch_id, int fragment_idx, const TQueryExecRequest &exec_request)
 

Private Attributes

boost::mutex backend_map_lock_
 
BackendMap backend_map_
 
BackendIpAddressMap backend_ip_map_
 
BackendIdMap current_membership_
 
MetricGroup * metrics_
 MetricGroup subsystem access. More...
 
Webserver * webserver_
 Webserver for /backends. Not owned by us. More...
 
BackendMap::iterator next_nonlocal_backend_entry_
 round robin entry in BackendMap for non-local host assignment More...
 
StatestoreSubscriber * statestore_subscriber_
 
const std::string backend_id_
 Unique - across the cluster - identifier for this impala backend. More...
 
TBackendDescriptor backend_descriptor_
 Describes this backend, including the Impalad service address. More...
 
ThriftSerializer thrift_serializer_
 
IntCounter * total_assignments_
 Locality metrics. More...
 
IntCounter * total_local_assignments_
 
BooleanProperty * initialised_
 Initialisation metric. More...
 
IntGauge * num_backends_metric_
 Current number of backends. More...
 
uint32_t update_count_
 Counts the number of UpdateMembership invocations, to help throttle the logging. More...
 
boost::mutex active_resources_lock_
 Protects active_reservations_ and active_client_resources_. More...
 
ActiveReservationsMap active_reservations_
 
ActiveClientResourcesMap active_client_resources_
 
ResourceBroker * resource_broker_
 
RequestPoolService * request_pool_service_
 
boost::scoped_ptr
< AdmissionController > 
admission_controller_
 Used to make admission decisions in 'Schedule()'. More...
 

Detailed Description

Performs simple scheduling by matching between a list of backends configured either from the statestore, or from a static list of addresses, and a list of target data locations. TODO: Notice when there are duplicate statestore registrations (IMPALA-23) TODO: Handle deltas from the statestore

Definition at line 46 of file simple-scheduler.h.

Member Typedef Documentation

typedef boost::unordered_map<TUniqueId, Coordinator*> impala::SimpleScheduler::ActiveClientResourcesMap
private

Maps from client resource id to the coordinator of the query using that resource. The map is used to cancel queries whose resource(s) have been preempted. Entries are added in Schedule() calls that result in granted resource allocations. Entries are removed in Release().

Definition at line 167 of file simple-scheduler.h.

typedef boost::unordered_map<TUniqueId, Coordinator*> impala::SimpleScheduler::ActiveReservationsMap
private

Maps from a Llama reservation id to the coordinator of the query using that reservation. The map is used to cancel queries whose reservation has been preempted. Entries are added in Schedule() calls that result in granted resource allocations. Entries are removed in Release().

Definition at line 160 of file simple-scheduler.h.

typedef boost::unordered_map<std::string, TBackendDescriptor> impala::SimpleScheduler::BackendIdMap
private

Map from unique backend id to TBackendDescriptor. Used to track the known backends from the statestore. It's important to track both the backend ID as well as the TBackendDescriptor so we know what is being removed in a given update. Locking of this map is not needed since it should only be read/modified from within the UpdateMembership() function.

Definition at line 116 of file simple-scheduler.h.

typedef boost::unordered_map<std::string, std::string> impala::SimpleScheduler::BackendIpAddressMap
private

Map from a datanode's hostname to its IP address, to support hostname-based lookup.

Definition at line 108 of file simple-scheduler.h.

typedef std::vector<TBackendDescriptor> impala::Scheduler::BackendList
inherited

List of server descriptors.

Definition at line 45 of file scheduler.h.

typedef boost::unordered_map<std::string, std::list<TBackendDescriptor> > impala::SimpleScheduler::BackendMap
private

Map from a datanode's IP address to a list of backend addresses running on that node.

Definition at line 103 of file simple-scheduler.h.

Constructor & Destructor Documentation

impala::SimpleScheduler::SimpleScheduler ( StatestoreSubscriber *  subscriber,
const std::string &  backend_id,
const TNetworkAddress &  backend_address,
MetricGroup *  metrics,
Webserver *  webserver,
ResourceBroker *  resource_broker,
RequestPoolService *  request_pool_service 
)

Initialize with a subscription manager that we can register with for updates to the set of available backends.

  • backend_id - unique identifier for this Impala backend (usually a host:port)
  • backend_address - the address that this backend listens on
impala::SimpleScheduler::SimpleScheduler ( const std::vector< TNetworkAddress > &  backends,
MetricGroup *  metrics,
Webserver *  webserver,
ResourceBroker *  resource_broker,
RequestPoolService *  request_pool_service 
)

Initialize with a list of <host:port> pairs in 'static' mode - i.e. the set of backends is fixed and will not be updated.

Member Function Documentation

void impala::SimpleScheduler::AddToActiveResourceMaps ( const TResourceBrokerReservationResponse &  reservation,
Coordinator *  coord 
)
private

Adds the granted reservation and resources to the active_reservations_ and active_client_resources_ maps, respectively.

Definition at line 916 of file simple-scheduler.cc.

void impala::SimpleScheduler::BackendsUrlCallback ( const Webserver::ArgumentMap &  args,
rapidjson::Document *  document 
)
private

Webserver callback that produces a list of known backends. Example output: "backends": [ "henry-metrics-pkg-cdh5.ent.cloudera.com:22000" ],

Definition at line 252 of file simple-scheduler.cc.

References backends, and impala::TNetworkAddressToString().

void impala::SimpleScheduler::ComputeFragmentExecParams ( const TQueryExecRequest &  exec_request,
QuerySchedule *  schedule 
)
private
void impala::SimpleScheduler::ComputeFragmentHosts ( const TQueryExecRequest &  exec_request,
QuerySchedule *  schedule 
)
private

For each fragment in exec_request, computes hosts on which to run the instances and stores result in fragment_exec_params_.hosts.

Definition at line 658 of file simple-scheduler.cc.

References impala::QuerySchedule::exec_params(), impala::FragmentExecParams::hosts, impala::MakeNetworkAddress(), and impala::QuerySchedule::SetUniqueHosts().

Status impala::SimpleScheduler::ComputeScanRangeAssignment ( const TQueryExecRequest &  exec_request,
QuerySchedule *  schedule 
)
private

Computes the assignment of scan ranges to hosts for each scan node in schedule. Unpartitioned fragments are assigned to the coord. Populates the schedule's fragment_exec_params_ with the resulting scan range assignment.

Definition at line 438 of file simple-scheduler.cc.

References impala::QuerySchedule::AddScanRanges(), impala::QuerySchedule::exec_params(), impala::QuerySchedule::GetFragmentIdx(), impala::OK, impala::QuerySchedule::query_options(), and RETURN_IF_ERROR.

Status impala::SimpleScheduler::ComputeScanRangeAssignment ( PlanNodeId  node_id,
const std::vector< TScanRangeLocations > &  locations,
const std::vector< TNetworkAddress > &  host_list,
bool  exec_at_coord,
const TQueryOptions &  query_options,
FragmentScanRangeAssignment *  assignment 
)
private

Does a scan range assignment (returned in 'assignment') based on a list of scan range locations for a particular scan node. If exec_at_coord is true, all scan ranges will be assigned to the coord node.

bool impala::SimpleScheduler::ContainsNode ( const TPlan &  plan,
TPlanNodeType::type  type 
)
private

Returns true if 'plan' contains a node of the given type.

Definition at line 758 of file simple-scheduler.cc.

int impala::SimpleScheduler::FindLeftmostInputFragment ( int  fragment_idx,
const TQueryExecRequest &  exec_request 
)
private

Returns the index (w/in exec_request.fragments) of fragment that sends its output to exec_request.fragments[fragment_idx]'s leftmost ExchangeNode. Returns INVALID_PLAN_NODE_ID if the leftmost node is not an exchange node.

Definition at line 798 of file simple-scheduler.cc.

PlanNodeId impala::SimpleScheduler::FindLeftmostNode ( const TPlan &  plan,
const std::vector< TPlanNodeType::type > &  types 
)
private

Returns the id of the leftmost node of any of the given types in 'plan', or INVALID_PLAN_NODE_ID if no such node present.

Definition at line 740 of file simple-scheduler.cc.

void impala::SimpleScheduler::FindNodes ( const TPlan &  plan,
const std::vector< TPlanNodeType::type > &  types,
std::vector< TPlanNodeId > *  results 
)
private

Returns all ids of nodes in 'plan' of any of the given types.

Definition at line 765 of file simple-scheduler.cc.

int impala::SimpleScheduler::FindSenderFragment ( TPlanNodeId  exch_id,
int  fragment_idx,
const TQueryExecRequest &  exec_request 
)
private

Returns the index (w/in exec_request.fragments) of fragment that sends its output to the given exchange in the given fragment index.

Definition at line 812 of file simple-scheduler.cc.

void impala::SimpleScheduler::GetAllKnownBackends ( BackendList *  backends)
virtual

Return a list of all backends known to the scheduler.

Implements impala::Scheduler.

Definition at line 429 of file simple-scheduler.cc.

Status impala::SimpleScheduler::GetBackend ( const TNetworkAddress &  data_location,
TBackendDescriptor *  backend 
)
virtual

Return a backend such that the impalad at backend.address should be used to read data from the given data_location.

Implements impala::Scheduler.

Definition at line 376 of file simple-scheduler.cc.

References impala::OK, VLOG_FILE, and VLOG_FILE_IS_ON.

Status impala::SimpleScheduler::GetBackends ( const std::vector< TNetworkAddress > &  data_locations,
BackendList *  backends 
)
virtual

Returns a list of backends such that the impalad at backends[i] should be used to read data from data_locations[i]. For each data_location, we choose a backend whose host matches the data_location in a round robin fashion and insert it into backends. If no match is found for a data location, assign the data location in round-robin order to any of the backends. If the set of available backends is updated between calls, round-robin state is reset.

Implements impala::Scheduler.

Definition at line 364 of file simple-scheduler.cc.

References impala::OK.

Status impala::SimpleScheduler::GetRequestPool ( const std::string &  user,
const TQueryOptions &  query_options,
std::string *  pool 
) const
private

Determines the pool for a user and query options via request_pool_service_.

Definition at line 826 of file simple-scheduler.cc.

References impala::ERROR_USER_NOT_ALLOWED_IN_POOL(), impala::ERROR_USER_TO_POOL_MAPPING_NOT_FOUND(), impala::OK, and RETURN_IF_ERROR.

void impala::SimpleScheduler::GetScanHosts ( TPlanNodeId  scan_id,
const TQueryExecRequest &  exec_request,
const FragmentExecParams &  params,
std::vector< TNetworkAddress > *  scan_hosts 
)
private

Adds all hosts the given scan is executed on to scan_hosts.

Definition at line 777 of file simple-scheduler.cc.

References impala::MakeNetworkAddress(), and impala::FragmentExecParams::scan_range_assignment.

void impala::SimpleScheduler::HandleLostResource ( const TUniqueId &  client_resource_id)
virtual

Notifies this scheduler that a single resource with the given client resource id has been lost by the central scheduler (Yarn via Llama). All affected queries are cancelled via their coordinator.

Implements impala::Scheduler.

Definition at line 981 of file simple-scheduler.cc.

void impala::SimpleScheduler::HandlePreemptedReservation ( const TUniqueId &  reservation_id)
virtual

Notifies this scheduler that a resource reservation has been preempted by the central scheduler (Yarn via Llama). All affected queries are cancelled via their coordinator.

Implements impala::Scheduler.

Definition at line 944 of file simple-scheduler.cc.

void impala::SimpleScheduler::HandlePreemptedResource ( const TUniqueId &  client_resource_id)
virtual

Notifies this scheduler that a single resource with the given client resource id has been preempted by the central scheduler (Yarn via Llama). All affected queries are cancelled via their coordinator.

Implements impala::Scheduler.

Definition at line 962 of file simple-scheduler.cc.

virtual bool impala::SimpleScheduler::HasLocalBackend ( const TNetworkAddress &  data_location)
inlinevirtual

Return true if there is a backend located on the given data_location.

Implements impala::Scheduler.

Definition at line 81 of file simple-scheduler.h.

References backend_map_, and backend_map_lock_.

Status impala::SimpleScheduler::Release ( QuerySchedule *  schedule)
virtual

Releases the reserved resources (if any) from the given schedule.

Implements impala::Scheduler.

Definition at line 896 of file simple-scheduler.cc.

References impala::QuerySchedule::NeedsRelease(), impala::OK, impala::QuerySchedule::reservation(), and RETURN_IF_ERROR.

void impala::SimpleScheduler::RemoveFromActiveResourceMaps ( const TResourceBrokerReservationResponse &  reservation)
private

Removes the given reservation and resources from the active_reservations_ and active_client_resources_ maps, respectively.

Definition at line 930 of file simple-scheduler.cc.

Status impala::SimpleScheduler::Schedule ( Coordinator *  coord,
QuerySchedule *  schedule 
)
virtual

Populates given query schedule whose execution is to be coordinated by coord. Assigns fragments to hosts based on scan ranges in the query exec request. If resource management is enabled, also reserves resources from the central resource manager (Yarn via Llama) to run the query in. This function blocks until the reservation request has been granted or denied.

Implements impala::Scheduler.

Definition at line 847 of file simple-scheduler.cc.

References impala::Status::AddDetail(), impala::DEFAULT_USER(), impala::QuerySchedule::effective_user(), impala::ERROR_USER_NOT_SPECIFIED(), impala::GetTablesMissingStatsWarning(), impala::OK, impala::Status::ok(), pool, impala::QuerySchedule::PrepareReservationRequest(), impala::QuerySchedule::query_options(), impala::QuerySchedule::request(), impala::QuerySchedule::reservation(), impala::QuerySchedule::reservation_request(), RETURN_IF_ERROR, impala::QuerySchedule::set_num_hosts(), impala::QuerySchedule::set_request_pool(), and impala::QuerySchedule::ValidateReservation().

void impala::SimpleScheduler::UpdateMembership ( const StatestoreSubscriber::TopicDeltaMap &  incoming_topic_deltas,
std::vector< TTopicDelta > *  subscriber_topic_updates 
)
private

Called asynchronously when an update is received from the subscription manager.

Definition at line 265 of file simple-scheduler.cc.

References DeserializeThriftMsg(), impala::Status::GetDetail(), and impala::Status::ok().

Member Data Documentation

ActiveClientResourcesMap impala::SimpleScheduler::active_client_resources_
private

Definition at line 168 of file simple-scheduler.h.

ActiveReservationsMap impala::SimpleScheduler::active_reservations_
private

Definition at line 161 of file simple-scheduler.h.

boost::mutex impala::SimpleScheduler::active_resources_lock_
private

Protects active_reservations_ and active_client_resources_.

Definition at line 154 of file simple-scheduler.h.

boost::scoped_ptr<AdmissionController> impala::SimpleScheduler::admission_controller_
private

Used to make admission decisions in 'Schedule()'.

Definition at line 179 of file simple-scheduler.h.

TBackendDescriptor impala::SimpleScheduler::backend_descriptor_
private

Describes this backend, including the Impalad service address.

Definition at line 137 of file simple-scheduler.h.

const std::string impala::SimpleScheduler::backend_id_
private

Unique - across the cluster - identifier for this impala backend.

Definition at line 134 of file simple-scheduler.h.

BackendIpAddressMap impala::SimpleScheduler::backend_ip_map_
private

Definition at line 109 of file simple-scheduler.h.

BackendMap impala::SimpleScheduler::backend_map_
private

Definition at line 104 of file simple-scheduler.h.

Referenced by HasLocalBackend().

boost::mutex impala::SimpleScheduler::backend_map_lock_
private

Protects access to backend_map_ and backend_ip_map_, which might otherwise be updated asynchronously with respect to reads. Also protects the locality counters, which are updated in GetBackends.

Definition at line 100 of file simple-scheduler.h.

Referenced by HasLocalBackend().

BackendIdMap impala::SimpleScheduler::current_membership_
private

Definition at line 117 of file simple-scheduler.h.

const string impala::SimpleScheduler::IMPALA_MEMBERSHIP_TOPIC
static
BooleanProperty* impala::SimpleScheduler::initialised_
private

Initialisation metric.

Definition at line 146 of file simple-scheduler.h.

MetricGroup* impala::SimpleScheduler::metrics_
private

MetricGroup subsystem access.

Definition at line 120 of file simple-scheduler.h.

BackendMap::iterator impala::SimpleScheduler::next_nonlocal_backend_entry_
private

round robin entry in BackendMap for non-local host assignment

Definition at line 126 of file simple-scheduler.h.

IntGauge* impala::SimpleScheduler::num_backends_metric_
private

Current number of backends.

Definition at line 148 of file simple-scheduler.h.

RequestPoolService* impala::SimpleScheduler::request_pool_service_
private

Used for user-to-pool resolution and looking up pool configurations. Not owned by us.

Definition at line 176 of file simple-scheduler.h.

ResourceBroker* impala::SimpleScheduler::resource_broker_
private

Resource broker that mediates resource requests between Impala and the Llama. Set to NULL if resource management is disabled.

Definition at line 172 of file simple-scheduler.h.

StatestoreSubscriber* impala::SimpleScheduler::statestore_subscriber_
private

Pointer to a subscription manager (which we do not own) which is used to register for dynamic updates to the set of available backends. May be NULL if the set of backends is fixed.

Definition at line 131 of file simple-scheduler.h.

ThriftSerializer impala::SimpleScheduler::thrift_serializer_
private

Definition at line 139 of file simple-scheduler.h.

IntCounter* impala::SimpleScheduler::total_assignments_
private

Locality metrics.

Definition at line 142 of file simple-scheduler.h.

IntCounter* impala::SimpleScheduler::total_local_assignments_
private

Definition at line 143 of file simple-scheduler.h.

uint32_t impala::SimpleScheduler::update_count_
private

Counts the number of UpdateMembership invocations, to help throttle the logging.

Definition at line 151 of file simple-scheduler.h.

Webserver* impala::SimpleScheduler::webserver_
private

Webserver for /backends. Not owned by us.

Definition at line 123 of file simple-scheduler.h.


The documentation for this class was generated from the following files: