Dev: ui_resource: Give warning on start/stop/restart if "is-managed" or "maintenance" is detected #859

Open: wants to merge 3 commits into base: master
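For context, a rough illustration of the behavior this PR targets, with the command sequence and the expected stderr line taken from the scenarios in test/features/operation_maintenance.feature added below; exact wording in other code paths may differ:

    # with cluster-wide maintenance enabled, stop is refused and the resource stays Started
    crm maintenance on
    crm resource stop d
    # stderr: ERROR: resource.stop: Resource d is unmanaged

    # once maintenance is lifted, the same command succeeds
    crm maintenance off
    crm resource stop d
    # resource "d" is now Stopped
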
10 changes: 10 additions & 0 deletions .github/workflows/crmsh-ci.yml
@@ -127,6 +127,16 @@ jobs:
$DOCKER_SCRIPT resource before_install
$DOCKER_SCRIPT resource run
functional_test_operation:
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v2
- name: functional test for operations
run: |
$DOCKER_SCRIPT operation before_install
$DOCKER_SCRIPT operation run
functional_test_configure_sublevel:
runs-on: ubuntu-latest
timeout-minutes: 20
4 changes: 2 additions & 2 deletions crmsh/bootstrap.py
@@ -2201,7 +2201,7 @@ def bootstrap_remove(context):
remove_self()
return

if _context.cluster_node in xmlutil.listnodes():
if _context.cluster_node in xmlutil.NodeState().list_nodes():
remove_node_from_cluster()
else:
error("Specified node {} is not configured in cluster! Unable to remove.".format(_context.cluster_node))
@@ -2210,7 +2210,7 @@ def bootstrap_remove(context):
def remove_self():
me = _context.cluster_node
yes_to_all = _context.yes_to_all
nodes = xmlutil.listnodes(include_remote_nodes=False)
nodes = xmlutil.NodeState().list_nodes(include_remote_nodes=False)
othernode = next((x for x in nodes if x != me), None)
if othernode is not None:
# remove from other node
2 changes: 1 addition & 1 deletion crmsh/completers.py
@@ -75,7 +75,7 @@ def primitives(args):
return [x.get("id") for x in nodes if xmlutil.is_primitive(x)]


nodes = call(xmlutil.listnodes)
nodes = call(xmlutil.NodeState().list_nodes)

shadows = call(xmlutil.listshadows)

2 changes: 2 additions & 0 deletions crmsh/ui_node.py
@@ -299,6 +299,8 @@ def do_standby(self, context, *args):
else:
syntax_err(args, context=context.get_command_name())
return False
if xmlutil.NodeState().is_node_in_maintenance(node):
context.warning("Node \"{}\" is in maintenance".format(node))
opts = ''
if lifetime:
opts = "--lifetime='%s'" % lifetime
3 changes: 3 additions & 0 deletions crmsh/ui_resource.py
@@ -293,6 +293,9 @@ def _commit_meta_attrs(self, context, resources, name, value):

rc = True
for rsc in resources:
rc_managed, reason = xmlutil.RscState().is_managed(rsc)
if not rc_managed:
context.warning("Resource {} is unmanaged ({})".format(rsc, reason))
rc = rc and set_deep_meta_attr(rsc, name, value, commit=False)
if commit and rc:
ok = cib_factory.commit()
141 changes: 96 additions & 45 deletions crmsh/xmlutil.py
@@ -17,7 +17,7 @@
from . import userdir
from .utils import add_sudo, str2file, str2tmp, get_boolean
from .utils import get_stdout, stdout2list, crm_msec, crm_time_cmp
from .utils import olist, get_cib_in_use, get_tempdir, to_ascii
from .utils import olist, get_cib_in_use, get_tempdir, to_ascii, running_on


def xmlparse(f):
@@ -173,7 +173,71 @@ def get_top_cib_nodes(node, nodes_l):
return nodes_l


class RscState(object):
class CibConfiguration(object):
"""
Base class to load current cib configuration
"""
def __init__(self):
self.current_cib = None

def _init_cib(self):
self.current_cib = cibdump2elem()
if self.current_cib is None:
raise ValueError("Cannot dump cib configuration")


class NodeState(CibConfiguration):
"""
Class to get specific node state
"""
def list_nodes(self, include_remote_nodes=True):
"""
List current nodes in cib
"""
if self.current_cib is None:
self._init_cib()
local_nodes = self.current_cib.xpath('configuration/nodes/node/@uname')
if include_remote_nodes:
remote_nodes = self.current_cib.xpath('status/node_state[@remote_node="true"]/@uname')
else:
remote_nodes = []
return list(set([n for n in local_nodes + remote_nodes if n]))

def get_specific_node(self, uname):
"""
Get a node XML element given the uname.
"""
if self.current_cib is None:
self._init_cib()
if uname not in self.list_nodes():
raise ValueError("Node \"{}\" not exist".format(uname))
return self.current_cib.xpath("configuration//*[@uname=\"{}\"]".format(uname))[0]

def is_node_in_maintenance(self, uname):
"""
Check if a node is in maintenance
"""
node_entry = self.get_specific_node(uname)
attr_entry = get_child_nvset_node(node_entry, attr_set="instance_attributes")
if attr_entry is None:
return False
attr = get_attr_value(attr_entry, "maintenance")
return is_xs_boolean_true(attr) if attr else False

def are_all_nodes_in_maintenance(self):
"""
Check if all nodes are in maintenance
"""
return all([self.is_node_in_maintenance(node) for node in self.list_nodes()])

def is_node_in_maintenance_for_the_running_resource(self, rsc_id):
"""
Check if node running this resource is in maintenance
"""
return any([self.is_node_in_maintenance(node) for node in running_on(rsc_id)])


class RscState(CibConfiguration):
'''
Get the resource status and some other relevant bits.
In particular, this class should allow for a bit of caching
@@ -184,17 +248,17 @@ class RscState(object):
rsc_status = "crm_resource -W -r '%s'"

def __init__(self):
self.current_cib = None
super(self.__class__, self).__init__()
self.rsc_elem = None
self.prop_elem = None
self.rsc_dflt_elem = None

def _init_cib(self):
cib = cibdump2elem("configuration")
self.current_cib = cib
self.rsc_elem = get_first_conf_elem(cib, "resources")
self.prop_elem = get_first_conf_elem(cib, "crm_config/cluster_property_set")
self.rsc_dflt_elem = get_first_conf_elem(cib, "rsc_defaults/meta_attributes")
def _load_cib(self):
if self.current_cib is None:
self._init_cib()
self.rsc_elem = get_first_conf_elem(self.current_cib, "resources")
self.prop_elem = get_first_conf_elem(self.current_cib, "crm_config/cluster_property_set")
self.rsc_dflt_elem = get_first_conf_elem(self.current_cib, "rsc_defaults/meta_attributes")

def rsc2node(self, ident):
'''
@@ -204,23 +268,21 @@ def rsc2node(self, ident):
expensive.
'''
if self.rsc_elem is None:
self._init_cib()
self._load_cib()
if self.rsc_elem is None:
return None
raise ValueError("Failed to load resources cib")
# does this need to be optimized?
expr = './/*[@id="%s"]' % ident
try:
return self.rsc_elem.xpath(expr)[0]
except (IndexError, AttributeError):
return None
raise ValueError("Cannot find resource \"{}\"".format(ident))

def is_ms(self, ident):
'''
Test if the resource is master-slave.
'''
rsc_node = self.rsc2node(ident)
if rsc_node is None:
return False
return is_ms(rsc_node)

def rsc_clone(self, ident):
@@ -229,8 +291,6 @@ def rsc_clone(self, ident):
or None if it's not cloned.
'''
rsc_node = self.rsc2node(ident)
if rsc_node is None:
return None
pnode = rsc_node.getparent()
if pnode is None:
return None
@@ -243,28 +303,33 @@ def rsc_clone(self, ident):
def is_managed(self, ident):
'''
Is this resource managed?
Return (boolean, reason)
'''
rsc_node = self.rsc2node(ident)
if rsc_node is None:
return False
# maintenance-mode, if true, overrides all
attr = get_attr_value(self.prop_elem, "maintenance-mode")
if attr and is_xs_boolean_true(attr):
return False
# then check the rsc is-managed meta attribute
return False, "cluster property maintenance-mode is true"
# then check if all nodes are in maintenance
if NodeState().are_all_nodes_in_maintenance():
return False, "all nodes are in maintenance"
# then check if node running this resource is in maintenance
if NodeState().is_node_in_maintenance_for_the_running_resource(ident):
return False, "node which running \"{}\" is in maintenance".format(ident)
rsc_meta_node = get_rsc_meta_node(rsc_node)
# then check the rsc maintenance meta attribute
attr = get_attr_value(rsc_meta_node, "maintenance")
if attr and is_xs_boolean_true(attr):
return False, "resource \"{}\" is in maintenance".format(ident)
Contributor:

The "maintenance" checks above are not within the scope of is_managed(). "maintenance" and "is-managed" are two different terminologies and do not all fit under the current function name. I'm thinking of function names along the lines of:

is_managed(rsc)
is_maintenance(rsc)
is_managed_or_maintenance(rsc)

Collaborator (Author):

In the crm_mon display, pacemaker shows both situations as unmanaged for the resource.
How about changing the function name to is_managed_or_maintenance?

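A minimal, hypothetical sketch of the split the reviewer suggests, reusing helpers already present in xmlutil.py (get_attr_value, is_xs_boolean_true, get_rsc_meta_node, NodeState). The method names and split are the reviewer's proposal, not code in this PR's diff:

    # Hypothetical refactor following the reviewer's naming suggestion; not part of this PR.
    def is_maintenance(self, ident):
        """True if maintenance applies at the cluster, node, or resource level."""
        rsc_node = self.rsc2node(ident)   # also loads prop_elem and rsc_dflt_elem via _load_cib
        attr = get_attr_value(self.prop_elem, "maintenance-mode")
        if attr and is_xs_boolean_true(attr):
            return True
        if NodeState().is_node_in_maintenance_for_the_running_resource(ident):
            return True
        attr = get_attr_value(get_rsc_meta_node(rsc_node), "maintenance")
        return bool(attr) and is_xs_boolean_true(attr)

    def is_managed_or_maintenance(self, ident):
        """Single yes/no check for callers gating start/stop/restart."""
        managed, _reason = self.is_managed(ident)   # is_managed would then cover only is-managed
        return managed and not self.is_maintenance(ident)
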
# then check the rsc is-managed meta attribute
attr = get_attr_value(rsc_meta_node, "is-managed")
if attr:
return is_xs_boolean_true(attr)
if attr and not is_xs_boolean_true(attr):
return False, "resource \"{}\" meta_attributes is-managed is false".format(ident)
# then rsc_defaults is-managed attribute
attr = get_attr_value(self.rsc_dflt_elem, "is-managed")
if attr:
return is_xs_boolean_true(attr)
# finally the is-managed-default property
attr = get_attr_value(self.prop_elem, "is-managed-default")
if attr:
return is_xs_boolean_true(attr)
return True
if attr and not is_xs_boolean_true(attr):
return False, "resource defaults meta_attributes is-managed is false"
return True, None

def is_running(self, ident):
'''
@@ -281,16 +346,14 @@ def is_group(self, ident):
Test if the resource is a group
'''
rsc_node = self.rsc2node(ident)
if rsc_node is None:
return False
return is_group(rsc_node)

def can_delete(self, ident):
'''
Can a resource be deleted?
The order below is important!
'''
return not (self.is_running(ident) and not self.is_group(ident) and self.is_managed(ident))
return not (self.is_running(ident) and not self.is_group(ident) and self.is_managed(ident)[0])


def resources_xml():
@@ -334,26 +397,14 @@ def mk_rsc_type(n):
return ''.join((s1, s2, ra_type))


def listnodes(include_remote_nodes=True):
cib = cibdump2elem()
if cib is None:
return []
local_nodes = cib.xpath('/cib/configuration/nodes/node/@uname')
if include_remote_nodes:
remote_nodes = cib.xpath('/cib/status/node_state[@remote_node="true"]/@uname')
else:
remote_nodes = []
return list(set([n for n in local_nodes + remote_nodes if n]))


def is_our_node(s):
'''
Check if s is in a list of our nodes (ignore case).
This is not fast, perhaps should be cached.

Includes remote nodes as well
'''
for n in listnodes():
for n in NodeState().list_nodes():
if n.lower() == s.lower():
return True
return False
3 changes: 2 additions & 1 deletion data-manifest
@@ -75,6 +75,7 @@ test/features/environment.py
test/features/geo_setup.feature
test/features/hb_report_bugs.feature
test/features/ocfs2.feature
test/features/operation_maintenance.feature
test/features/qdevice_options.feature
test/features/qdevice_setup_remove.feature
test/features/qdevice_usercase.feature
@@ -194,8 +195,8 @@ test/unittests/test_objset.py
test/unittests/test_ocfs2.py
test/unittests/test_parallax.py
test/unittests/test_parse.py
test/unittests/test_ratrace.py
test/unittests/test_qdevice.py
test/unittests/test_ratrace.py
test/unittests/test_report.py
test/unittests/test_sbd.py
test/unittests/test_scripts.py
57 changes: 57 additions & 0 deletions test/features/operation_maintenance.feature
@@ -0,0 +1,57 @@
@operation
Feature: Test cluster/node/resources maintenance

Tag @clean means the cluster service needs to be stopped if the service is available

Background: Setup one node cluster and configure some resources
Given Cluster service is "stopped" on "hanode1"
Given Cluster service is "stopped" on "hanode2"
When Run "crm cluster init -y" on "hanode1"
Then Cluster service is "started" on "hanode1"
When Run "crm cluster join -c hanode1 -y" on "hanode2"
Then Cluster service is "started" on "hanode2"
When Run "crm configure primitive d Dummy op monitor interval=3s" on "hanode1"
Then Resource "d" type "Dummy" is "Started"

@clean
Scenario: Give error when start/stop resources while cluster in maintenance
When Run "crm maintenance on" on "hanode1"
And Try "crm resource stop d" on "hanode1"
Then Except "ERROR: resource.stop: Resource d is unmanaged" in stderr
Then Resource "d" type "Dummy" is "Started"
When Run "crm maintenance off" on "hanode1"
When Run "crm resource stop d" on "hanode1"
Then Resource "d" type "Dummy" is "Stopped"

@clean
Scenario: Give error when start/stop resources while all nodes in maintenance
When Run "crm node maintenance hanode1" on "hanode1"
When Run "crm node maintenance hanode2" on "hanode2"
And Try "crm resource stop d" on "hanode1"
Then Except "ERROR: resource.stop: Resource d is unmanaged" in stderr
Then Resource "d" type "Dummy" is "Started"
When Run "crm node ready hanode1" on "hanode1"
When Run "crm node ready hanode2" on "hanode2"
When Run "crm resource stop d" on "hanode1"
Then Resource "d" type "Dummy" is "Stopped"

@clean
Scenario: Give error when start/stop resources while node running this RA in maintenance
When Run "crm configure location loc1 d 100: hanode1" on "hanode1"
And Run "crm node maintenance hanode1" on "hanode1"
And Try "crm resource stop d" on "hanode1"
Then Except "ERROR: resource.stop: Resource d is unmanaged" in stderr
Then Resource "d" type "Dummy" is "Started"
When Run "crm node ready hanode1" on "hanode1"
When Run "crm resource stop d" on "hanode1"
Then Resource "d" type "Dummy" is "Stopped"

@clean
Scenario: Give error when start/stop resources while this RA in maintenance
When Run "crm resource maintenance d on" on "hanode1"
And Try "crm resource stop d" on "hanode1"
Then Except "ERROR: resource.stop: Resource d is unmanaged" in stderr
Then Resource "d" type "Dummy" is "Started"
When Run "crm resource maintenance d off" on "hanode1"
When Run "crm resource stop d" on "hanode1"
Then Resource "d" type "Dummy" is "Stopped"
2 changes: 1 addition & 1 deletion test/run-in-travis.sh
@@ -27,7 +27,7 @@ case "$1" in
configure
make_install
exit $?;;
bootstrap|qdevice|hb_report|resource|geo|configure|constraints|ocfs2)
bootstrap|qdevice|hb_report|resource|geo|configure|constraints|ocfs2|operation)
functional_tests $1 $2
exit $?;;
*|original)