#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2015: Alignak team, see AUTHORS.txt file for contributors
#
# This file is part of Alignak.
#
# Alignak is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Alignak is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Alignak. If not, see <http://www.gnu.org/licenses/>.
#
#
# This file incorporates work covered by the following copyright and
# permission notice:
#
# Copyright (C) 2009-2014:
# xkilian, fmikus@acktomic.com
# David Moreau Simard, dmsimard@iweb.com
# Guillaume Bour, guillaume@bour.cc
# aviau, alexandre.viau@savoirfairelinux.com
# Hartmut Goebel, h.goebel@goebel-consult.de
# Nicolas Dupeux, nicolas@dupeux.net
# Bruno Clermont, bruno.clermont@gmail.com
# Grégory Starck, g.starck@gmail.com
# Sebastien Coavoux, s.coavoux@free.fr
# Olivier Hanesse, olivier.hanesse@gmail.com
# Jean Gabes, naparuba@gmail.com
# Zoran Zaric, zz@zoranzaric.de
# Gerhard Lausser, gerhard.lausser@consol.de
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken. If not, see <http://www.gnu.org/licenses/>.
"""
This class is an interface for Reactionner and Poller daemons
A Reactionner listens to a port for the configuration from the Arbiter
The conf contains the schedulers where actionners will gather actions.
The Reactionner keeps on listening to the Arbiter
(one a timeout)
If Arbiter wants it to have a new conf, the satellite forgets the previous
Schedulers (and actions into) and takes the new ones.
"""
# Try to see if we are in an android device or not
import imp
try:
imp.find_module('android')
IS_ANDROID = True
except ImportError:
IS_ANDROID = False
if not IS_ANDROID:
from multiprocessing import Queue, active_children, cpu_count
else:
from Queue import Queue
import os
import copy
import time
import cPickle
import traceback
import zlib
import base64
import threading
from alignak.http.client import HTTPClient, HTTPEXCEPTIONS
from alignak.http.generic_interface import GenericInterface
from alignak.message import Message
from alignak.worker import Worker
from alignak.load import Load
from alignak.daemon import Daemon
from alignak.log import logger
from alignak.stats import statsmgr
[docs]class NotWorkerMod(Exception):
"""Class to tell that we are facing a non worker module
but a standard one
"""
pass
[docs]class BaseSatellite(Daemon):
"""Base Satellite class.
Subclassed by Alignak (scheduler), Broker and Satellite
"""
def __init__(self, name, config_file, is_daemon, do_replace, debug, debug_file):
super(BaseSatellite, self).__init__(name, config_file, is_daemon,
do_replace, debug, debug_file)
# Ours schedulers
self.schedulers = {}
# Now we create the interfaces
self.http_interface = GenericInterface(self)
# Can have a queue of external_commands given by modules
# will be taken by arbiter to process
self.external_commands = []
self.external_commands_lock = threading.RLock()
[docs] def watch_for_new_conf(self, timeout):
"""Triggered by Arbiter get to make the satellite wait new conf
Timeout is short is (1.0 or 0)
:param timeout: timeout to wait
:type timeout: float
:return: None
TODO: Clean this, handle return a tuple and it is not used
"""
self.handle_requests(timeout)
[docs] def what_i_managed(self):
"""Get the managed configuration by this satellite
:return: a dict of scheduler id as key and push_flavor as values
:rtype: dict
"""
res = {}
for (key, val) in self.schedulers.iteritems():
res[key] = val['push_flavor']
return res
[docs] def get_external_commands(self):
"""Get the external commands
:return: External commands list
:rtype: list
"""
res = self.external_commands
self.external_commands = []
return res
[docs] def do_loop_turn(self):
"""Abstract method for satellite loop turn.
It must be overridden by class inheriting from Daemon
:return: None
"""
raise NotImplementedError()
[docs]class Satellite(BaseSatellite):
"""Satellite class.
Subclassed by Receiver, Reactionner and Poller
"""
def __init__(self, name, config_file, is_daemon, do_replace, debug, debug_file):
super(Satellite, self).__init__(name, config_file, is_daemon, do_replace,
debug, debug_file)
# Keep broks so they can be eaten by a broker
self.broks = {}
self.workers = {} # dict of active workers
# Init stats like Load for workers
self.wait_ratio = Load(initial_value=1)
self.slave_q = None
self.returns_queue = None
self.q_by_mod = {}
[docs] def pynag_con_init(self, _id):
"""Wrapped function for do_pynag_con_init
:param _id: scheduler _id to connect to
:type _id: int
:return: scheduler connection object or None
:rtype: alignak.http.client.HTTPClient
"""
_t0 = time.time()
res = self.do_pynag_con_init(_id)
statsmgr.incr('con-init.scheduler', time.time() - _t0)
return res
[docs] def do_pynag_con_init(self, s_id):
"""Initialize a connection with scheduler having '_id'
Return the new connection to the scheduler if it succeeded,
else: any error OR sched is inactive: return None.
NB: if sched is inactive then None is directly returned.
:param s_id: scheduler s_id to connect to
:type s_id: int
:return: scheduler connection object or None
:rtype: alignak.http.client.HTTPClient
"""
sched = self.schedulers[s_id]
if not sched['active']:
return
sname = sched['name']
uri = sched['uri']
running_id = sched['running_id']
timeout = sched['timeout']
data_timeout = sched['data_timeout']
logger.info("[%s] Init connection with %s at %s (%ss,%ss)",
self.name, sname, uri, timeout, data_timeout)
try:
sch_con = sched['con'] = HTTPClient(
uri=uri, strong_ssl=sched['hard_ssl_name_check'],
timeout=timeout, data_timeout=data_timeout)
except HTTPEXCEPTIONS, exp:
logger.warning("[%s] Scheduler %s is not initialized or has network problem: %s",
self.name, sname, str(exp))
sched['con'] = None
return
# timeout of 3s by default (short one)
# and get the running s_id
try:
new_run_id = sch_con.get('get_running_id')
new_run_id = float(new_run_id)
except (HTTPEXCEPTIONS, cPickle.PicklingError, KeyError), exp:
logger.warning("[%s] Scheduler %s is not initialized or has network problem: %s",
self.name, sname, str(exp))
sched['con'] = None
return
# The schedulers have been restarted: it has a new run_id.
# So we clear all verifs, they are obsolete now.
if sched['running_id'] != 0 and new_run_id != running_id:
logger.info("[%s] The running id of the scheduler %s changed, "
"we must clear its actions",
self.name, sname)
sched['wait_homerun'].clear()
sched['running_id'] = new_run_id
logger.info("[%s] Connection OK with scheduler %s", self.name, sname)
return sch_con
[docs] def manage_action_return(self, action):
"""Manage action return from Workers
We just put them into the corresponding sched
and we clean unused properties like sched_id
:param action: the action to manage
:type action: alignak.action.Action
:return: None
"""
# Maybe our workers end us something else than an action
# if so, just add this in other queues and return
cls_type = action.__class__.my_type
if cls_type not in ['check', 'notification', 'eventhandler']:
self.add(action)
return
# Ok, it's a result. We get it, and fill verifs of the good sched_id
sched_id = action.sched_id
# Now we now where to put action, we do not need sched_id anymore
del action.sched_id
# Unset the tag of the worker_id too
try:
del action.worker_id
except AttributeError:
pass
# And we remove it from the actions queue of the scheduler too
try:
del self.schedulers[sched_id]['actions'][action.get_id()]
except KeyError:
pass
# We tag it as "return wanted", and move it in the wait return queue
# Stop, if it is "timeout" we need this information later
# in the scheduler
# action.status = 'waitforhomerun'
try:
self.schedulers[sched_id]['wait_homerun'][action.get_id()] = action
except KeyError:
pass
[docs] def manage_returns(self):
""" Wrapper function of do_manage_returns()
:return: None
TODO: Use a decorator for stat
"""
_t0 = time.time()
self.do_manage_returns()
statsmgr.incr('core.manage-returns', time.time() - _t0)
[docs] def do_manage_returns(self):
"""Manage the checks and then
send a HTTP request to schedulers (POST /put_results)
REF: doc/alignak-action-queues.png (6)
:return: None
"""
# For all schedulers, we check for wait_homerun
# and we send back results
for sched_id, sched in self.schedulers.iteritems():
if not sched['active']:
continue
results = sched['wait_homerun']
# NB: it's **mostly** safe for us to not use some lock around
# this 'results' / sched['wait_homerun'].
# Because it can only be modified (for adding new values) by the
# same thread running this function (that is the main satellite
# thread), and this occurs exactly in self.manage_action_return().
# Another possibility is for the sched['wait_homerun'] to be
# cleared within/by :
# ISchedulers.get_returns() -> Satelitte.get_return_for_passive()
# This can so happen in an (http) client thread.
if not results:
return
# So, at worst, some results would be received twice on the
# scheduler level, which shouldn't be a problem given they are
# indexed by their "action_id".
send_ok = False
try:
con = sched.get('con')
if con is None: # None = not initialized
con = self.pynag_con_init(sched_id)
if con:
con.post('put_results', {'results': results.values()})
send_ok = True
except HTTPEXCEPTIONS as err:
logger.error('Could not send results to scheduler %s : %s',
sched['name'], err)
except Exception as err:
logger.exception("Unhandled exception trying to send results "
"to scheduler %s: %s", sched['name'], err)
raise
finally:
if send_ok:
results.clear()
else: # if - and only if - send was not ok,
# then "de-init" the sched connection:
sched['con'] = None
[docs] def get_return_for_passive(self, sched_id):
"""Get returns of passive actions for a specific scheduler
:param sched_id: scheduler id
:type sched_id: int
:return: Action list
:rtype: list
"""
# I do not know this scheduler?
sched = self.schedulers.get(sched_id)
if sched is None:
logger.debug("I do not know this scheduler: %s", sched_id)
return []
ret, sched['wait_homerun'] = sched['wait_homerun'], {}
logger.debug("Preparing to return %s results", len(ret))
return ret.values()
[docs] def create_and_launch_worker(self, module_name='fork', mortal=True,
__warned=set()):
"""Create and launch a new worker, and put it into self.workers
It can be mortal or not
:param module_name: the module name related to the worker
default is "fork" for no module
:type module_name: str
:param mortal: make the Worker mortal or not. Default True
:type mortal: bool
:param __warned: Remember the module we warned about.
This param is a tuple and as it is only init once (the default value)
we use this python behavior that make this set grows with module_name
not found on previous call
:type __warned: set
:return: None
"""
# create the input queue of this worker
try:
if IS_ANDROID:
queue = Queue()
else:
queue = self.manager.Queue()
# If we got no /dev/shm on linux-based system, we can got problem here.
# Must raise with a good message
except OSError, exp:
# We look for the "Function not implemented" under Linux
if exp.errno == 38 and os.name == 'posix':
logger.critical("Got an exception (%s). If you are under Linux, "
"please check that your /dev/shm directory exists and"
" is read-write.", str(exp))
raise
# If we are in the fork module, we do not specify a target
target = None
if module_name == 'fork':
target = None
else:
for module in self.modules_manager.instances:
if module.properties['type'] == module_name:
# First, see if the module is a 'worker' one or not
if not module.properties.get('worker_capable', False):
raise NotWorkerMod
target = module.work
if target is None:
if module_name not in __warned:
logger.warning("No target found for %s, NOT creating a worker for it..",
module_name)
__warned.add(module_name)
return
# We want to give to the Worker the name of the daemon (poller or reactionner)
cls_name = self.__class__.__name__.lower()
worker = Worker(1, queue, self.returns_queue, self.processes_by_worker,
mortal=mortal, max_plugins_output_length=self.max_plugins_output_length,
target=target, loaded_into=cls_name, http_daemon=self.http_daemon)
worker.module_name = module_name
# save this worker
self.workers[worker._id] = worker
# And save the Queue of this worker, with key = worker id
self.q_by_mod[module_name][worker._id] = queue
logger.info("[%s] Allocating new %s Worker: %s", self.name, module_name, worker._id)
# Ok, all is good. Start it!
worker.start()
[docs] def do_stop(self):
"""Stop all workers modules and sockets
:return: None
"""
logger.info("[%s] Stopping all workers", self.name)
for worker in self.workers.values():
try:
worker.terminate()
worker.join(timeout=1)
# A already dead worker or in a worker
except (AttributeError, AssertionError):
pass
super(Satellite, self).do_stop()
[docs] def add(self, elt):
"""Add an object to the satellite one
Handles brok and externalcommand
:param elt: object to add
:type elt: object
:return: None
"""
cls_type = elt.__class__.my_type
if cls_type == 'brok':
# For brok, we TAG brok with our instance_id
elt.instance_id = 0
self.broks[elt._id] = elt
return
elif cls_type == 'externalcommand':
logger.debug("Enqueuing an external command '%s'", str(elt.__dict__))
with self.external_commands_lock:
self.external_commands.append(elt)
[docs] def get_broks(self):
"""Get brok list from satellite
:return: A copy of the Brok list
:rtype: list
"""
res = copy.copy(self.broks)
self.broks.clear()
return res
[docs] def check_and_del_zombie_workers(self):
"""Check if worker are fine and kill them if not.
Dispatch the actions in the worker to another one
:return: None
"""
# In android, we are using threads, so there is not active_children call
if not IS_ANDROID:
# Active children make a join with everyone, useful :)
active_children()
w_to_del = []
for worker in self.workers.values():
# If a worker goes down and we did not ask him, it's not
# good: we can think that we have a worker and it's not True
# So we del it
if not worker.is_alive():
logger.warning("[%s] The worker %s goes down unexpectedly!", self.name, worker._id)
# Terminate immediately
worker.terminate()
worker.join(timeout=1)
w_to_del.append(worker._id)
# OK, now really del workers from queues
# And requeue the actions it was managed
for w_id in w_to_del:
worker = self.workers[w_id]
# Del the queue of the module queue
del self.q_by_mod[worker.module_name][worker._id]
for sched_id in self.schedulers:
sched = self.schedulers[sched_id]
for act in sched['actions'].values():
if act.status == 'queue' and act.worker_id == w_id:
# Got a check that will NEVER return if we do not
# restart it
self.assign_to_a_queue(act)
# So now we can really forgot it
del self.workers[w_id]
[docs] def adjust_worker_number_by_load(self):
"""Try to create the minimum workers specified in the configuration
:return: None
"""
to_del = []
logger.debug("[%s] Trying to adjust worker number."
" Actual number : %d, min per module : %d, max per module : %d",
self.name, len(self.workers), self.min_workers, self.max_workers)
# I want at least min_workers by module then if I can, I add worker for load balancing
for mod in self.q_by_mod:
# At least min_workers
todo = max(0, self.min_workers - len(self.q_by_mod[mod]))
for _ in range(todo):
try:
self.create_and_launch_worker(module_name=mod)
# Maybe this modules is not a true worker one.
# if so, just delete if from q_by_mod
except NotWorkerMod:
to_del.append(mod)
break
for mod in to_del:
logger.debug("[%s] The module %s is not a worker one, "
"I remove it from the worker list", self.name, mod)
del self.q_by_mod[mod]
# TODO: if len(workers) > 2*wish, maybe we can kill a worker?
def _got_queue_from_action(self, action):
"""Find action queue for the action depending on the module.
The id is found with action modulo on action id
:param a: the action that need action queue to be assigned
:type action: object
:return: worker id and queue. (0, None) if no queue for the module_type
:rtype: tuple
"""
# get the module name, if not, take fork
mod = getattr(action, 'module_type', 'fork')
queues = self.q_by_mod[mod].items()
# Maybe there is no more queue, it's very bad!
if len(queues) == 0:
return (0, None)
# if not get action round robin index to get action queue based
# on the action id
rr_idx = action._id % len(queues)
(index, queue) = queues[rr_idx]
# return the id of the worker (i), and its queue
return (index, queue)
[docs] def add_actions(self, lst, sched_id):
"""Add a list of actions to the satellite queues
:param lst: Action list
:type lst: list
:param sched_id: sheduler id to assign to
:type sched_id: int
:return: None
"""
for act in lst:
# First we look if we do not already have it, if so
# do nothing, we are already working!
if act._id in self.schedulers[sched_id]['actions']:
continue
act.sched_id = sched_id
act.status = 'queue'
self.assign_to_a_queue(act)
[docs] def assign_to_a_queue(self, action):
"""Take an action and put it to action queue
:param action: action to put
:type action: alignak.action.Action
:return: None
"""
msg = Message(_id=0, _type='Do', data=action)
(index, queue) = self._got_queue_from_action(action)
# Tag the action as "in the worker i"
action.worker_id = index
if queue is not None:
queue.put(msg)
[docs] def get_new_actions(self):
""" Wrapper function for do_get_new_actions
For stats purpose
:return: None
TODO: Use a decorator
"""
_t0 = time.time()
self.do_get_new_actions()
statsmgr.incr('core.get-new-actions', time.time() - _t0)
[docs] def do_get_new_actions(self):
"""Get new actions from schedulers
Create a Message and put into the module queue
REF: doc/alignak-action-queues.png (1)
:return: None
"""
# Here are the differences between a
# poller and a reactionner:
# Poller will only do checks,
# reactionner do actions (notif + event handlers)
do_checks = self.__class__.do_checks
do_actions = self.__class__.do_actions
# We check for new check in each schedulers and put the result in new_checks
for sched_id in self.schedulers:
sched = self.schedulers[sched_id]
# If sched is not active, I do not try return
if not sched['active']:
continue
try:
try:
con = sched['con']
except KeyError:
con = None
if con is not None: # None = not initialized
# OK, go for it :)
# Before ask a call that can be long, do a simple ping to be sure it is alive
con.get('ping')
tmp = con.get('get_checks', {
'do_checks': do_checks, 'do_actions': do_actions,
'poller_tags': self.poller_tags,
'reactionner_tags': self.reactionner_tags,
'worker_name': self.name,
'module_types': self.q_by_mod.keys()
},
wait='long')
# Explicit pickle load
tmp = base64.b64decode(tmp)
tmp = zlib.decompress(tmp)
tmp = cPickle.loads(str(tmp))
logger.debug("Ask actions to %d, got %d", sched_id, len(tmp))
# We 'tag' them with sched_id and put into queue for workers
# REF: doc/alignak-action-queues.png (2)
self.add_actions(tmp, sched_id)
else: # no con? make the connection
self.pynag_con_init(sched_id)
# Ok, con is unknown, so we create it
# Or maybe is the connection lost, we recreate it
except (HTTPEXCEPTIONS, KeyError), exp:
logger.debug('get_new_actions exception:: %s,%s ', type(exp), str(exp))
self.pynag_con_init(sched_id)
# scheduler must not be initialized
# or scheduler must not have checks
except AttributeError, exp:
logger.debug('get_new_actions exception:: %s,%s ', type(exp), str(exp))
# What the F**k? We do not know what happened,
# log the error message if possible.
except Exception, exp:
logger.error("A satellite raised an unknown exception: %s (%s)", exp, type(exp))
raise
[docs] def get_returns_queue_len(self):
"""Wrapper for returns_queue.qsize method. Return queue length
:return: queue length
:rtype: int
"""
return self.returns_queue.qsize()
[docs] def get_returns_queue_item(self):
"""Wrapper for returns_queue.get method. Return an queue element
:return: queue Message
:rtype: alignak.message.Message
"""
return self.returns_queue.get()
[docs] def clean_previous_run(self):
"""Clean variables from previous configuration,
such as schedulers, broks and external commands
:return: None
"""
# Clean all lists
self.schedulers.clear()
self.broks.clear()
with self.external_commands_lock:
self.external_commands = self.external_commands[:]
[docs] def do_loop_turn(self):
"""Satellite main loop::
* Setup new conf if necessary
* Watch for new conf
* Check and delete zombies actions / modules
* Get returns from queues
* Adjust worker number
* Get new actions
:return: None
"""
logger.debug("Loop turn")
# Maybe the arbiter ask us to wait for a new conf
# If true, we must restart all...
if self.cur_conf is None:
# Clean previous run from useless objects
# and close modules
self.clean_previous_run()
self.wait_for_initial_conf()
# we may have been interrupted or so; then
# just return from this loop turn
if not self.new_conf:
return
self.setup_new_conf()
# Now we check if arbiter speak to us.
# If so, we listen to it
# When it push a conf, we reinit connections
# Sleep in waiting a new conf :)
# TODO: manage the diff again.
while self.timeout > 0:
begin = time.time()
self.watch_for_new_conf(self.timeout)
end = time.time()
if self.new_conf:
self.setup_new_conf()
self.timeout = self.timeout - (end - begin)
logger.debug(" ======================== ")
self.timeout = self.polling_interval
# Check if zombies workers are among us :)
# If so: KILL THEM ALL!!!
self.check_and_del_zombie_workers()
# But also modules
self.check_and_del_zombie_modules()
# Print stats for debug
for sched_id in self.schedulers:
sched = self.schedulers[sched_id]
for mod in self.q_by_mod:
# In workers we've got actions send to queue - queue size
for (index, queue) in self.q_by_mod[mod].items():
logger.debug("[%d][%s][%s] Stats: Workers:%d (Queued:%d TotalReturnWait:%d)",
sched_id, sched['name'], mod,
index, queue.qsize(), self.get_returns_queue_len())
# also update the stats module
statsmgr.incr('core.worker-%s.queue-size' % mod, queue.qsize())
# Before return or get new actions, see how we manage
# old ones: are they still in queue (s)? If True, we
# must wait more or at least have more workers
wait_ratio = self.wait_ratio.get_load()
total_q = 0
for mod in self.q_by_mod:
for queue in self.q_by_mod[mod].values():
total_q += queue.qsize()
if total_q != 0 and wait_ratio < 2 * self.polling_interval:
logger.debug("I decide to up wait ratio")
self.wait_ratio.update_load(wait_ratio * 2)
# self.wait_ratio.update_load(self.polling_interval)
else:
# Go to self.polling_interval on normal run, if wait_ratio
# was >2*self.polling_interval,
# it make it come near 2 because if < 2, go up :)
self.wait_ratio.update_load(self.polling_interval)
wait_ratio = self.wait_ratio.get_load()
logger.debug("Wait ratio: %f", wait_ratio)
statsmgr.incr('core.wait-ratio', wait_ratio)
# We can wait more than 1s if needed,
# no more than 5s, but no less than 1
timeout = self.timeout * wait_ratio
timeout = max(self.polling_interval, timeout)
self.timeout = min(5 * self.polling_interval, timeout)
statsmgr.incr('core.timeout', wait_ratio)
# Maybe we do not have enough workers, we check for it
# and launch the new ones if needed
self.adjust_worker_number_by_load()
# Manage all messages we've got in the last timeout
# for queue in self.return_messages:
while self.get_returns_queue_len() != 0:
self.manage_action_return(self.get_returns_queue_item())
# If we are passive, we do not initiate the check getting
# and return
if not self.passive:
# Now we can get new actions from schedulers
self.get_new_actions()
# We send all finished checks
# REF: doc/alignak-action-queues.png (6)
self.manage_returns()
# Get objects from our modules that are not worker based
self.get_objects_from_from_queues()
# Say to modules it's a new tick :)
self.hook_point('tick')
[docs] def do_post_daemon_init(self):
"""Do this satellite (poller or reactionner) post "daemonize" init
:return: None
"""
# self.s = Queue() # Global Master -> Slave
# We can open the Queue for fork AFTER
self.q_by_mod['fork'] = {}
# Under Android, we do not have multiprocessing lib
# so use standard Queue threads things
# but in multiprocess, we are also using a Queue(). It's just
# not the same
if IS_ANDROID:
self.returns_queue = Queue()
else:
self.returns_queue = self.manager.Queue()
# For multiprocess things, we should not have
# socket timeouts.
import socket
socket.setdefaulttimeout(None)
[docs] def setup_new_conf(self):
"""Setup new conf received from Arbiter
:return: None
"""
with self.conf_lock:
conf = self.new_conf
logger.debug("[%s] Sending us a configuration %s", self.name, conf)
self.new_conf = None
self.cur_conf = conf
g_conf = conf['global']
# Got our name from the globals
if 'poller_name' in g_conf:
name = g_conf['poller_name']
elif 'reactionner_name' in g_conf:
name = g_conf['reactionner_name']
else:
name = 'Unnamed satellite'
self.name = name
# kernel.io part
self.api_key = g_conf['api_key']
self.secret = g_conf['secret']
self.http_proxy = g_conf['http_proxy']
# local statsd
self.statsd_host = g_conf['statsd_host']
self.statsd_port = g_conf['statsd_port']
self.statsd_prefix = g_conf['statsd_prefix']
self.statsd_enabled = g_conf['statsd_enabled']
# we got a name, we can now say it to our statsmgr
if 'poller_name' in g_conf:
statsmgr.register(self, self.name, 'poller',
api_key=self.api_key, secret=self.secret,
http_proxy=self.http_proxy,
statsd_host=self.statsd_host, statsd_port=self.statsd_port,
statsd_prefix=self.statsd_prefix,
statsd_enabled=self.statsd_enabled)
else:
statsmgr.register(self, self.name, 'reactionner',
api_key=self.api_key, secret=self.secret,
statsd_host=self.statsd_host, statsd_port=self.statsd_port,
statsd_prefix=self.statsd_prefix,
statsd_enabled=self.statsd_enabled)
self.passive = g_conf['passive']
if self.passive:
logger.info("[%s] Passive mode enabled.", self.name)
# If we've got something in the schedulers, we do not want it anymore
for sched_id in conf['schedulers']:
already_got = False
# We can already got this conf id, but with another address
if sched_id in self.schedulers:
new_addr = conf['schedulers'][sched_id]['address']
old_addr = self.schedulers[sched_id]['address']
new_port = conf['schedulers'][sched_id]['port']
old_port = self.schedulers[sched_id]['port']
# Should got all the same to be ok :)
if new_addr == old_addr and new_port == old_port:
already_got = True
if already_got:
logger.info("[%s] We already got the conf %d (%s)",
self.name, sched_id, conf['schedulers'][sched_id]['name'])
wait_homerun = self.schedulers[sched_id]['wait_homerun']
actions = self.schedulers[sched_id]['actions']
sched = conf['schedulers'][sched_id]
self.schedulers[sched_id] = sched
if sched['name'] in g_conf['satellitemap']:
sched.update(g_conf['satellitemap'][sched['name']])
proto = 'http'
if sched['use_ssl']:
proto = 'https'
uri = '%s://%s:%s/' % (proto, sched['address'], sched['port'])
self.schedulers[sched_id]['uri'] = uri
if already_got:
self.schedulers[sched_id]['wait_homerun'] = wait_homerun
self.schedulers[sched_id]['actions'] = actions
else:
self.schedulers[sched_id]['wait_homerun'] = {}
self.schedulers[sched_id]['actions'] = {}
self.schedulers[sched_id]['running_id'] = 0
self.schedulers[sched_id]['active'] = sched['active']
self.schedulers[sched_id]['timeout'] = sched['timeout']
self.schedulers[sched_id]['data_timeout'] = sched['data_timeout']
# Do not connect if we are a passive satellite
if not self.passive and not already_got:
# And then we connect to it :)
self.pynag_con_init(sched_id)
# Now the limit part, 0 mean: number of cpu of this machine :)
# if not available, use 4 (modern hardware)
self.max_workers = g_conf['max_workers']
if self.max_workers == 0 and not IS_ANDROID:
try:
self.max_workers = cpu_count()
except NotImplementedError:
self.max_workers = 4
logger.info("[%s] Using max workers: %s", self.name, self.max_workers)
self.min_workers = g_conf['min_workers']
if self.min_workers == 0 and not IS_ANDROID:
try:
self.min_workers = cpu_count()
except NotImplementedError:
self.min_workers = 4
logger.info("[%s] Using min workers: %s", self.name, self.min_workers)
self.processes_by_worker = g_conf['processes_by_worker']
self.polling_interval = g_conf['polling_interval']
self.timeout = self.polling_interval
# Now set tags
# ['None'] is the default tags
self.poller_tags = g_conf.get('poller_tags', ['None'])
self.reactionner_tags = g_conf.get('reactionner_tags', ['None'])
self.max_plugins_output_length = g_conf.get('max_plugins_output_length', 8192)
# Set our giving timezone from arbiter
use_timezone = g_conf['use_timezone']
if use_timezone != 'NOTSET':
logger.info("[%s] Setting our timezone to %s", self.name, use_timezone)
os.environ['TZ'] = use_timezone
time.tzset()
logger.info("We have our schedulers: %s", str(self.schedulers))
# Now manage modules
# TODO: check how to better handle this with modules_manager..
mods = g_conf['modules']
for module in mods:
# If we already got it, bypass
if module.module_type not in self.q_by_mod:
logger.debug("Add module object %s", str(module))
self.modules_manager.modules.append(module)
logger.info("[%s] Got module: %s ", self.name, module.module_type)
self.q_by_mod[module.module_type] = {}
[docs] def get_stats_struct(self):
"""Get state of modules and create a scheme for stats data of daemon
This may be overridden in subclasses
:return: A dict with the following structure
::
{ 'metrics': ['%s.%s.external-commands.queue %d %d'],
'version': __version__,
'name': self.name,
'type': _type,
'passive': self.passive,
'modules':
{'internal': {'name': "MYMODULE1", 'state': 'ok'},
{'external': {'name': "MYMODULE2", 'state': 'stopped'},
]
}
:rtype: dict
"""
now = int(time.time())
# call the daemon one
res = super(Satellite, self).get_stats_struct()
_type = self.__class__.my_type
res.update({'name': self.name, 'type': _type})
# The receiver do nto have a passie prop
if hasattr(self, 'passive'):
res['passive'] = self.passive
metrics = res['metrics']
# metrics specific
metrics.append('%s.%s.external-commands.queue %d %d' % (
_type, self.name, len(self.external_commands), now))
return res
[docs] def main(self):
"""Main satellite function. Do init and then mainloop
:return: None
"""
try:
for line in self.get_header():
logger.info(line)
self.load_config_file()
# Setting log level
logger.setLevel(self.log_level)
# Force the debug level if the daemon is said to start with such level
if self.debug:
logger.setLevel('DEBUG')
# Look if we are enabled or not. If ok, start the daemon mode
self.look_for_early_exit()
self.do_daemon_init_and_start()
self.do_post_daemon_init()
self.load_modules_manager()
# We wait for initial conf
self.wait_for_initial_conf()
if not self.new_conf: # we must have either big problem or was requested to shutdown
return
self.setup_new_conf()
# We can load our modules now
self.modules_manager.set_modules(self.modules_manager.modules)
self.do_load_modules()
# And even start external ones
self.modules_manager.start_external_instances()
# Allocate Mortal Threads
for _ in xrange(1, self.min_workers):
to_del = []
for mod in self.q_by_mod:
try:
self.create_and_launch_worker(module_name=mod)
# Maybe this modules is not a true worker one.
# if so, just delete if from q_by_mod
except NotWorkerMod:
to_del.append(mod)
for mod in to_del:
logger.debug("The module %s is not a worker one, "
"I remove it from the worker list", mod)
del self.q_by_mod[mod]
# Now main loop
self.do_mainloop()
except Exception:
self.print_unrecoverable(traceback.format_exc())
raise