diff --git a/.github/workflows/crmsh-ci.yml b/.github/workflows/crmsh-ci.yml index 0bfe67ce8..6dc6b8978 100644 --- a/.github/workflows/crmsh-ci.yml +++ b/.github/workflows/crmsh-ci.yml @@ -415,6 +415,21 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} flags: integration + functional_test_migration: + runs-on: ubuntu-24.04 + timeout-minutes: 40 + steps: + - uses: actions/checkout@v4 + - name: functional test for migration + run: | + echo '{ "exec-opts": ["native.cgroupdriver=systemd"] }' | sudo tee /etc/docker/daemon.json + sudo systemctl restart docker.service + $CONTAINER_SCRIPT `$GET_INDEX_OF migration` && $CONTAINER_SCRIPT -d && $CONTAINER_SCRIPT -u `$GET_INDEX_OF migration` + - uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + flags: integration + original_regression_test: runs-on: ubuntu-24.04 timeout-minutes: 40 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..05a3e1ed6 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include crmsh *.txt diff --git a/codecov.yml b/codecov.yml index 47000a652..9c67c4c4a 100644 --- a/codecov.yml +++ b/codecov.yml @@ -8,7 +8,7 @@ coverage: threshold: 0.35% codecov: notify: - after_n_builds: 28 + after_n_builds: 29 comment: - after_n_builds: 28 + after_n_builds: 29 layout: "condensed_header, flags, files, condensed_footer" diff --git a/crmsh/cibquery.py b/crmsh/cibquery.py new file mode 100644 index 000000000..0683ba952 --- /dev/null +++ b/crmsh/cibquery.py @@ -0,0 +1,26 @@ +"""utilities for parsing CIB xml""" +import dataclasses +import typing + +import lxml.etree + + +@dataclasses.dataclass(frozen=True) +class ResourceAgent: + m_class: str + m_provider: typing.Optional[str] + m_type: str + + +def get_configured_resource_agents(cib: lxml.etree.Element) -> typing.Set[ResourceAgent]: + return set( + ResourceAgent(e.get('class'), e.get('provider'), e.get('type')) + for e in cib.xpath('/cib/configuration/resources//primitive') + ) + + +def has_primitive_filesystem_with_fstype(cib: lxml.etree.Element, fstype: str) -> bool: + return bool(cib.xpath( + '/cib/configuration/resources//primitive[@class="ocf" and @provider="heartbeat" and @type="Filesystem"]' + f'/instance_attributes/nvpair[@name="fstype" and @value="{fstype}"]' + )) diff --git a/crmsh/corosync.py b/crmsh/corosync.py index e2919f7c4..d73b26ec8 100644 --- a/crmsh/corosync.py +++ b/crmsh/corosync.py @@ -85,7 +85,7 @@ def configure_two_node(removing: bool = False, qdevice_adding: bool = False) -> def conf(): - return os.getenv('COROSYNC_MAIN_CONFIG_FILE', '/etc/corosync/corosync.conf') + return os.environ.get('COROSYNC_MAIN_CONFIG_FILE', '/etc/corosync/corosync.conf') def check_tools(): diff --git a/crmsh/migration-supported-resource-agents.txt b/crmsh/migration-supported-resource-agents.txt new file mode 100644 index 000000000..af24a367f --- /dev/null +++ b/crmsh/migration-supported-resource-agents.txt @@ -0,0 +1,147 @@ +ocf:heartbeat:CTDB +ocf:heartbeat:ClusterMon +ocf:heartbeat:Delay +ocf:heartbeat:Dummy +ocf:heartbeat:Filesystem +ocf:heartbeat:IPaddr2 +ocf:heartbeat:IPsrcaddr +ocf:heartbeat:IPv6addr +ocf:heartbeat:LVM-activate +ocf:heartbeat:MailTo +ocf:heartbeat:NodeUtilization +ocf:heartbeat:Raid1 +ocf:heartbeat:Route +ocf:heartbeat:SAPDatabase +ocf:heartbeat:SAPInstance +ocf:heartbeat:SendArp +ocf:heartbeat:Squid +ocf:heartbeat:Stateful +ocf:heartbeat:VirtualDomain +ocf:heartbeat:WAS +ocf:heartbeat:WAS6 +ocf:heartbeat:Xinetd +ocf:heartbeat:aliyun-vpc-move-ip +ocf:heartbeat:apache +ocf:heartbeat:aws-vpc-move-ip 
+ocf:heartbeat:aws-vpc-route53 +ocf:heartbeat:awseip +ocf:heartbeat:awsvip +ocf:heartbeat:azure-events +ocf:heartbeat:azure-events-az +ocf:heartbeat:azure-lb +ocf:heartbeat:conntrackd +ocf:heartbeat:corosync-qnetd +ocf:heartbeat:crypt +ocf:heartbeat:db2 +ocf:heartbeat:dhcpd +ocf:heartbeat:docker +ocf:heartbeat:docker-compose +ocf:heartbeat:dummypy +ocf:heartbeat:ethmonitor +ocf:heartbeat:exportfs +ocf:heartbeat:galera +ocf:heartbeat:garbd +ocf:heartbeat:gcp-ilb +ocf:heartbeat:gcp-pd-move +ocf:heartbeat:gcp-vpc-move-ip +ocf:heartbeat:gcp-vpc-move-vip +ocf:heartbeat:iSCSILogicalUnit +ocf:heartbeat:iSCSITarget +ocf:heartbeat:iface-bridge +ocf:heartbeat:iface-macvlan +ocf:heartbeat:iface-vlan +ocf:heartbeat:ldirectord +ocf:heartbeat:lvmlockd +ocf:heartbeat:mariadb +ocf:heartbeat:mdraid +ocf:heartbeat:mpathpersist +ocf:heartbeat:mysql +ocf:heartbeat:mysql-proxy +ocf:heartbeat:named +ocf:heartbeat:nfsnotify +ocf:heartbeat:nfsserver +ocf:heartbeat:nginx +ocf:heartbeat:nvmet-namespace +ocf:heartbeat:nvmet-port +ocf:heartbeat:nvmet-subsystem +ocf:heartbeat:oraasm +ocf:heartbeat:oracle +ocf:heartbeat:oralsnr +ocf:heartbeat:osceip +ocf:heartbeat:ovsmonitor +ocf:heartbeat:pgagent +ocf:heartbeat:pgsql +ocf:heartbeat:podman +ocf:heartbeat:portblock +ocf:heartbeat:postfix +ocf:heartbeat:powervs-subnet +ocf:heartbeat:rabbitmq-cluster +ocf:heartbeat:rabbitmq-server-ha +ocf:heartbeat:redis +ocf:heartbeat:rsyncd +ocf:heartbeat:sfex +ocf:heartbeat:sg_persist +ocf:heartbeat:slapd +ocf:heartbeat:storage-mon +ocf:heartbeat:symlink +ocf:heartbeat:tomcat +ocf:suse:aws-vpc-move-ip +ocf:suse:SAPHanaController +ocf:suse:SAPHanaFilesystem +ocf:suse:SAPHanaTopology +stonith:fence_aliyun +stonith:fence_alom +stonith:fence_apc +stonith:fence_apc-snmp +stonith:fence_aws +stonith:fence_azure-arm +stonith:fence_bladecenter +stonith:fence_brocade +stonith:fence_cisco-mds +stonith:fence_cisco-ucs +stonith:fence_compute +stonith:fence_docker +stonith:fence_drac5 +stonith:fence_eaton-snmp +stonith:fence_eaton-ssh +stonith:fence_emerson +stonith:fence_eps +stonith:fence_gce +stonith:fence_hds-cb +stonith:fence_hpblade +stonith:fence_ibm-powervs +stonith:fence_ibm-vpc +stonith:fence_ibmblade +stonith:fence_ibmz +stonith:fence_ifmib +stonith:fence_ilo-moonshot +stonith:fence_ilo-mp +stonith:fence_ilo-ssh +stonith:fence_ilo2 +stonith:fence_intelmodular +stonith:fence_ipdu +stonith:fence_ipmilan +stonith:fence_ironic +stonith:fence_kdump +stonith:fence_ldom +stonith:fence_lpar +stonith:fence_mpath +stonith:fence_netio +stonith:fence_openstack +stonith:fence_pve +stonith:fence_raritan +stonith:fence_rcd-serial +stonith:fence_redfish +stonith:fence_rhevm +stonith:fence_rsa +stonith:fence_rsb +stonith:fence_sanbox2 +stonith:fence_sbd +stonith:fence_scsi +stonith:fence_vbox +stonith:fence_virsh +stonith:fence_vmware +stonith:fence_vmware-rest +stonith:fence_wti +stonith:fence_xenapi +stonith:fence_zvm diff --git a/crmsh/migration.py b/crmsh/migration.py new file mode 100644 index 000000000..014273b4f --- /dev/null +++ b/crmsh/migration.py @@ -0,0 +1,624 @@ +import argparse +import dataclasses +import importlib.resources +import itertools +import json +import logging +import os +import re +import shutil +import subprocess +import sys +import threading +import tempfile +import typing + +import lxml.etree + +from crmsh import cibquery +from crmsh import constants +from crmsh import corosync +from crmsh import corosync_config_format +from crmsh import parallax +from crmsh import service_manager +from crmsh import sh +from crmsh import utils 
+from crmsh import xmlutil +from crmsh.prun import prun + +logger = logging.getLogger(__name__) + + +SAP_HANA_RESOURCE_AGENTS = { + cibquery.ResourceAgent('ocf', 'suse', 'SAPHana'), + cibquery.ResourceAgent('ocf', 'suse', 'SAPHanaController'), + cibquery.ResourceAgent('ocf', 'suse', 'SAPHanaTopology'), +} + + +class MigrationFailure(Exception): + pass + + +class CheckResultHandler: + def log_info(self, fmt: str, *args): + raise NotImplementedError + + def handle_tip(self, title: str, details: typing.Iterable[str]): + raise NotImplementedError + + def handle_problem(self, is_fatal: bool, title: str, detail: typing.Iterable[str]): + raise NotImplementedError + + def end(self): + raise NotImplementedError + + +class CheckResultJsonHandler(CheckResultHandler): + def __init__(self, indent: typing.Optional[int] = None): + self._indent = indent + self.json_result = { + "pass": True, + "problems": [], + "tips": [], + } + def log_info(self, fmt: str, *args): + logger.debug(fmt, *args) + + def handle_tip(self, title: str, details: typing.Iterable[str]): + self.json_result["tips"].append({ + "title": title, + "descriptions": details if isinstance(details, list) else list(details), + }) + + def handle_problem(self, is_fatal: bool, title: str, detail: typing.Iterable[str]): + self.json_result["pass"] = False + self.json_result["problems"].append({ + "is_fatal": is_fatal, + "title": title, + "descriptions": detail if isinstance(detail, list) else list(detail), + }) + + def end(self): + json.dump( + self.json_result, + sys.stdout, + ensure_ascii=False, + indent=self._indent, + ) + sys.stdout.write('\n') + + +class CheckResultInteractiveHandler(CheckResultHandler): + def __init__(self): + self.has_problems = False + self.has_tips = False + + def log_info(self, fmt: str, *args): + self.write_in_color(sys.stdout, constants.GREEN, '[INFO] ') + print(fmt % args) + + def handle_problem(self, is_fatal: bool, title: str, details: typing.Iterable[str]): + self.has_problems = True + self.write_in_color(sys.stdout, constants.YELLOW, '[FAIL] ') + print(title) + for line in details: + sys.stdout.write(' ') + print(line) + if is_fatal: + raise MigrationFailure('Unable to start migration.') + + def handle_tip(self, title: str, details: typing.Iterable[str]): + self.has_tips = True + self.write_in_color(sys.stdout, constants.YELLOW, '[WARN] ') + print(title) + for line in details: + sys.stdout.write(' ') + print(line) + + @staticmethod + def write_in_color(f, color: str, text: str): + if f.isatty(): + f.write(color) + f.write(text) + f.write(constants.END) + else: + f.write(text) + + def end(self): + sys.stdout.write('\n') + + +def migrate(): + try: + match _check_impl(local=False, json='', summary=False): + case 0: + logger.info("This cluster works on SLES 16. 
No migration is needed.")
+                return 0
+            case 1:
+                logger.info('Starting migration...')
+                migrate_corosync_conf()
+                logger.info('Finished migration.')
+                return 0
+            case _:
+                raise MigrationFailure('Unable to start migration.')
+    except MigrationFailure as e:
+        logger.error('%s', e)
+        return 1
+
+
+def check(args: typing.Sequence[str]) -> int:
+    parser = argparse.ArgumentParser(args[0])
+    parser.add_argument('--json', nargs='?', const='pretty', choices=['oneline', 'pretty'])
+    parser.add_argument('--local', action='store_true')
+    parsed_args = parser.parse_args(args[1:])
+    ret = _check_impl(parsed_args.local or parsed_args.json is not None, parsed_args.json, parsed_args.json is None)
+    if not parsed_args.json:
+        print('****** summary ******')
+        match ret:
+            case 0:
+                CheckResultInteractiveHandler.write_in_color(sys.stdout, constants.GREEN, '[INFO]')
+                sys.stdout.write(' This cluster works on SLES 16. No migration is needed.\n')
+            case 1:
+                CheckResultInteractiveHandler.write_in_color(sys.stdout, constants.GREEN, '[INFO]')
+                sys.stdout.write(' Please run "crm cluster health sles16 --fix" on any one of the above nodes.\n')
+                CheckResultInteractiveHandler.write_in_color(sys.stdout, constants.GREEN, '[PASS]')
+                sys.stdout.write(' This cluster is good to migrate to SLES 16.\n')
+            case _:
+                CheckResultInteractiveHandler.write_in_color(sys.stdout, constants.RED, '[FAIL]')
+                sys.stdout.write(' The pacemaker cluster stack cannot migrate to SLES 16.\n')
+    return ret
+
+
+def _check_impl(local: bool, json: str, summary: bool) -> int:
+    """Return:
+    * 0 if the cluster is already on SLES 16.
+    * 1 if the cluster is good to migrate to SLES 16.
+    * 2 if the cluster is not good to migrate to SLES 16."""
+    assert not summary or not bool(json)
+    assert local or not bool(json)
+    match json:
+        case 'oneline':
+            handler = CheckResultJsonHandler()
+        case 'pretty':
+            handler = CheckResultJsonHandler(indent=2)
+        case _:
+            handler = CheckResultInteractiveHandler()
+    ret = 0
+    if local:
+        check_remote_yield = itertools.repeat(0)
+        check_local(handler)
+    else:
+        check_remote_yield = check_remote()
+        next(check_remote_yield)
+        print('------ localhost ------')
+        check_local(handler)
+        print('\n------ cib ------')
+        check_global(handler)
+    handler.end()
+    match handler:
+        case CheckResultJsonHandler():
+            if not handler.json_result["pass"]:
+                ret = 2
+            elif handler.json_result['tips']:
+                ret = 1
+            else:
+                ret = 0
+        case CheckResultInteractiveHandler():
+            if handler.has_problems:
+                ret = 2
+            elif handler.has_tips:
+                ret = 1
+            else:
+                ret = 0
+    if check_remote_yield:
+        remote_ret = next(check_remote_yield)
+        if remote_ret > ret:
+            ret = remote_ret
+    return ret
+
+
+def check_local(handler: CheckResultHandler):
+    check_dependency_version(handler)
+    check_service_status(handler)
+    check_unsupported_corosync_features(handler)
+
+
+def check_remote():
+    handler = CheckResultInteractiveHandler()
+
+    class CheckRemoteThread(threading.Thread):
+        def run(self):
+            self.result = prun.prun({
+                node: 'crm cluster health sles16 --local --json=oneline'
+                for node in utils.list_cluster_nodes_except_me()
+            })
+
+    prun_thread = CheckRemoteThread()
+    prun_thread.start()
+    yield
+    prun_thread.join()
+    ret = 0
+    for host, result in prun_thread.result.items():
+        sys.stdout.write(f'------ {host} ------\n')
+        match result:
+            case prun.SSHError() as e:
+                handler.write_in_color(
+                    sys.stdout, constants.YELLOW,
+                    str(e)
+                )
+                sys.stdout.write('\n')
+                ret = 255
+            case prun.ProcessResult() as result:
+                try:
+                    check_result = json.loads(result.stdout.decode('utf-8'))
+                except (UnicodeDecodeError, json.JSONDecodeError):
+                    print(result.stdout.decode('utf-8', 'backslashreplace'))
+                    handler.write_in_color(
+                        sys.stderr, constants.YELLOW,
+                        result.stderr.decode('utf-8', 'backslashreplace')
+                    )
+                    sys.stdout.write('\n')
+                    # cannot pass the exit status through,
+                    # as all failed exit status become 1 in ui_context.Context.run()
+                    ret = 2
+                else:
+                    passed = check_result.get("pass", False)
+                    handler = CheckResultInteractiveHandler()
+                    for problem in check_result.get("problems", list()):
+                        handler.handle_problem(False, problem.get("title", ""), problem.get("descriptions"))
+                    tips = check_result.get('tips', list())
+                    for tip in tips:
+                        handler.handle_tip(tip.get("title", ""), tip.get("descriptions"))
+                    handler.end()
+                    if not passed:
+                        ret = 2
+                    elif tips:
+                        ret = 1
+                    else:
+                        ret = result.returncode
+    yield ret
+
+
+def check_global(handler: CheckResultHandler):
+    check_unsupported_resource_agents(handler)
+
+
+def check_dependency_version(handler: CheckResultHandler):
+    handler.log_info('Checking dependency version...')
+    shell = sh.LocalShell()
+    out = shell.get_stdout_or_raise_error(None, 'corosync -v')
+    _check_version_range(
+        handler,
+        'Corosync', (3,),
+        re.compile(r"version\s+'(\d+(?:\.\d+)*)'"),
+        out,
+    )
+
+
+def _check_version_range(
+        handler: CheckResultHandler, component_name: str,
+        minimum: tuple,
+        pattern,
+        text: str,
+):
+    match = pattern.search(text)
+    if not match:
+        handler.handle_problem(
+            False, f'{component_name} version not supported', [
+                'Unknown version:',
+                text,
+            ],
+        )
+    else:
+        version = tuple(int(x) for x in match.group(1).split('.'))
+        if not minimum <= version:
+            handler.handle_problem(
+                False, f'{component_name} version not supported', [
+                    'Supported version: {} <= {}'.format(
+                        '.'.join(str(x) for x in minimum),
+                        component_name,
+                    ),
+                    f'Actual version: {component_name} == {match.group(1)}',
+                ],
+            )
+
+
+def check_service_status(handler: CheckResultHandler):
+    handler.log_info('Checking service status...')
+    manager = service_manager.ServiceManager()
+    active_services = [x for x in ['corosync', 'pacemaker'] if manager.service_is_active(x)]
+    if active_services:
+        handler.handle_problem(False, 'Cluster services are running', (f'* {x}' for x in active_services))
+
+
+def check_unsupported_corosync_features(handler: CheckResultHandler):
+    handler.log_info("Checking used corosync features...")
+    conf_path = corosync.conf()
+    with open(conf_path, 'r', encoding='utf-8') as f:
+        config = corosync_config_format.DomParser(f).dom()
+    if config['totem'].get('rrp_mode', None) in {'active', 'passive'}:
+        handler.handle_tip('Corosync RRP will be deprecated in corosync 3.', [
+            'Run "crm cluster health sles16 --fix" to migrate it to knet multilink.',
+        ])
+    _check_unsupported_corosync_transport(handler, config)
+
+
+def _check_unsupported_corosync_transport(handler: CheckResultHandler, dom):
+    transport = dom['totem'].get('transport', None)
+    if transport == 'knet':
+        return
+    if transport is None:
+        try:
+            dom['totem']['interface'][0]['bindnetaddr']
+        except KeyError:
+            # looks like a corosync 3 config
+            return
+        # corosync 2 defaults to "udp" when transport is not set
+        transport = 'udp'
+    handler.handle_tip(f'Corosync transport "{transport}" will be deprecated in corosync 3. Please use knet.', [])
+
+
+def migrate_corosync_conf():
+    conf_path = corosync.conf()
+    with open(conf_path, 'r', encoding='utf-8') as f:
+        config = corosync_config_format.DomParser(f).dom()
+    logger.info('Migrating corosync configuration...')
+    migrate_corosync_conf_impl(config)
+    shutil.copy(conf_path, conf_path + '.bak')
+    with utils.open_atomic(conf_path, 'w', fsync=True, encoding='utf-8') as f:
+        corosync_config_format.DomSerializer(config, f)
+        os.fchmod(f.fileno(), 0o644)
+    logger.info('Finished migrating corosync configuration.')
+    for host, result in prun.pcopy_to_remote(conf_path, utils.list_cluster_nodes_except_me(), conf_path).items():
+        match result:
+            case None:
+                pass
+            case prun.PRunError() as e:
+                logger.error("Failed to copy corosync.conf to host %s: %s", host, e)
+
+
+def migrate_corosync_conf_impl(config):
+    assert 'totem' in config
+    corosync.ConfParser.transform_dom_with_list_schema(config)
+    migrate_transport(config)
+    migrate_crypto(config)
+    migrate_rrp(config)
+    # TODO: other migrations
+
+
+def migrate_transport(dom):
+    match dom['totem'].get('transport', None):
+        case 'knet':
+            return
+        case 'udpu':
+            migrate_udpu(dom)
+        case 'udp':
+            migrate_multicast(dom)
+        case _:
+            # corosync 2 defaults to "udp"
+            try:
+                dom['totem']['interface'][0]['bindnetaddr']
+            except KeyError:
+                # looks like a corosync 3 config; nothing to migrate
+                return
+            if 'nodelist' not in dom:
+                migrate_multicast(dom)
+
+
+def migrate_udpu(dom):
+    dom['totem']['transport'] = 'knet'
+    if 'interface' in dom['totem']:
+        for interface in dom['totem']['interface']:
+            # remove udp-only items
+            interface.pop('mcastaddr', None)
+            interface.pop('bindnetaddr', None)
+            interface.pop('broadcast', None)
+            interface.pop('ttl', None)
+            ringnumber = interface.pop('ringnumber', None)
+            if ringnumber is not None:
+                interface['linknumber'] = ringnumber
+    if 'quorum' in dom:
+        dom['quorum'].pop('expected_votes', None)
+    logger.info("Upgrade totem.transport to knet.")
+
+
+def migrate_multicast(dom):
+    dom['totem']['transport'] = 'knet'
+    for interface in dom['totem']['interface']:
+        # remove udp-only items
+        interface.pop('mcastaddr', None)
+        interface.pop('bindnetaddr', None)
+        interface.pop('broadcast', None)
+        interface.pop('ttl', None)
+        ringnumber = interface.pop('ringnumber', None)
+        if ringnumber is not None:
+            interface['linknumber'] = ringnumber
+    logger.info("Generating nodelist according to CIB...")
+    with open(constants.CIB_RAW_FILE, 'rb') as f:
+        cib = Cib(f)
+    cib_nodes = cib.nodes()
+    assert 'nodelist' not in dom
+    nodelist = list()
+    with tempfile.TemporaryDirectory(prefix='crmsh-migration-') as dir_name:
+        node_configs = {
+            x[0]: x[1]
+            for x in parallax.parallax_slurp([x.uname for x in cib_nodes], dir_name, corosync.conf())
+        }
+        for node in cib_nodes:
+            assert node.uname in node_configs
+            with open(node_configs[node.uname], 'r', encoding='utf-8') as f:
+                root = corosync_config_format.DomParser(f).dom()
+                corosync.ConfParser.transform_dom_with_list_schema(root)
+            interfaces = root['totem']['interface']
+            addresses = {f'ring{i}_addr': x['bindnetaddr'] for i, x in enumerate(interfaces)}
+            logger.info("Node %s: %s: %s", node.node_id, node.uname, addresses)
+            nodelist.append({
+                'nodeid': node.node_id,
+                'name': node.uname,
+            } | addresses)
+    dom['nodelist'] = {'node': nodelist}
+    if 'quorum' in dom:
+        dom['quorum'].pop('expected_votes', None)
+        logger.info("Unset quorum.expected_votes.")
+    logger.info("Upgrade totem.transport to knet.")
+
+
+def migrate_crypto(dom):
+    try:
+        # corosync 3 changed the default hash algorithm to sha256 when `secauth` is enabled
+        if dom['totem'].get('crypto_hash', None) == 'sha1':
+            dom['totem']['crypto_hash'] = 'sha256'
+            logger.info('Upgrade totem.crypto_hash from "sha1" to "sha256".')
+    except KeyError:
+        dom['totem']['crypto_hash'] = 'sha256'
+
+
+def migrate_rrp(dom):
+    try:
+        nodes = dom['nodelist']['node']
+    except KeyError:
+        return
+    is_rrp = any('ring1_addr' in node for node in nodes)
+    if not is_rrp:
+        return
+    try:
+        rrp_mode = dom['totem']['rrp_mode']
+        del dom['totem']['rrp_mode']
+        if rrp_mode == 'active':
+            dom['totem']['link_mode'] = 'active'
+    except KeyError:
+        pass
+    assert all('nodeid' in node for node in nodes)
+    if any('name' not in node for node in nodes):
+        populate_node_name(nodes)
+
+
+def populate_node_name(nodelist):
+    # cannot use utils.list_cluster_nodes, as pacemaker is not running
+    with open(constants.CIB_RAW_FILE, 'rb') as f:
+        cib = Cib(f)
+    cib_nodes = {node.node_id: node for node in cib.nodes()}
+    for node in nodelist:
+        node_id = int(node['nodeid'])
+        node['name'] = cib_nodes[node_id].uname
+
+
+class Cib:
+    class Node:
+        def __init__(self, node_id: int, uname: str):
+            self.node_id = node_id
+            self.uname = uname
+
+    def __init__(self, f: typing.IO):
+        self._dom = lxml.etree.parse(f)
+
+    def nodes(self):
+        result = list()
+        for element in self._dom.xpath(constants.XML_NODE_PATH):
+            uname = element.get('uname')
+            if element.get('type') == 'remote':
+                xpath = (
+                    "//primitive[@provider='pacemaker' and @type='remote']"
+                    f"/instance_attributes/nvpair[@name='server' and @value='{uname}']"
+                )
+                if self._dom.xpath(xpath):
+                    continue
+            node_id = element.get('id')
+            assert node_id
+            assert uname
+            result.append(Cib.Node(int(node_id), uname))
+        return result
+
+
+def check_unsupported_resource_agents(handler: CheckResultHandler):
+    handler.log_info("Checking used resource agents...")
+    ocf_resource_agents = list()
+    stonith_resource_agents = list()
+    cib = xmlutil.text2elem(sh.LocalShell().get_stdout_or_raise_error(None, 'crm configure show xml'))
+    for resource_agent in cibquery.get_configured_resource_agents(cib):
+        if resource_agent.m_class == 'ocf':
+            ocf_resource_agents.append(resource_agent)
+        elif resource_agent.m_class == 'stonith':
+            if resource_agent.m_type == 'external/sbd':
+                handler.handle_tip('stonith:external/sbd will be removed.
Please use stonith:fence_sbd', [ + ]) + else: + stonith_resource_agents.append(resource_agent) + else: + raise ValueError(f'Unrecognized resource agent {resource_agent}') + _check_saphana_resource_agent(handler, ocf_resource_agents) + class TitledCheckResourceHandler(CheckResultHandler): + def __init__(self, parent: CheckResultHandler, title: str): + self._parent = parent + self._title= title + def log_info(self, fmt: str, *args): + return self._parent.log_info(fmt, *args) + def handle_problem(self, is_fatal: bool, title: str, detail: typing.Iterable[str]): + return self._parent.handle_problem(is_fatal, self._title, detail) + def handle_tip(self, title: str, details: typing.Iterable[str]): + return self._parent.handle_tip(self._title, details) + supported_resource_agents = _load_supported_resource_agents() + _check_removed_resource_agents( + TitledCheckResourceHandler(handler, "The following resource agents will be removed in SLES 16."), + supported_resource_agents, + (agent for agent in ocf_resource_agents if agent not in SAP_HANA_RESOURCE_AGENTS), + ) + _check_removed_resource_agents( + TitledCheckResourceHandler(handler, "The following fence agents will be removed in SLES 16."), + supported_resource_agents, + stonith_resource_agents, + ) + _check_ocfs2(handler, cib) + + +def _check_saphana_resource_agent(handler: CheckResultHandler, resource_agents: typing.Iterable[cibquery.ResourceAgent]): + # "SAPHana" appears only in SAPHanaSR Classic + has_sap_hana_sr_resources = any(agent in SAP_HANA_RESOURCE_AGENTS for agent in resource_agents) + if has_sap_hana_sr_resources: + if 0 != subprocess.run( + ['rpm', '-q', 'SAPHanaSR-angi'], + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ).returncode: + handler.handle_problem(False, 'SAPHanaSR Classic will be removed in SLES 16.', [ + 'Before migrating to SLES 16, replace it with SAPHanaSR-angi.', + ]) + + +def _load_supported_resource_agents() -> typing.Set[cibquery.ResourceAgent]: + ret = set() + with importlib.resources.files('crmsh').joinpath('migration-supported-resource-agents.txt').open( + 'r', encoding='ascii', + ) as r: + for line in r: + parts = line.strip().split(':', 3) + m_class = parts[0] + m_provider = parts[1] if len(parts) == 3 else None + m_type = parts[-1] + ret.add(cibquery.ResourceAgent(m_class, m_provider, m_type)) + return ret + + + +def _check_removed_resource_agents( + handler: CheckResultHandler, + supported_resource_agents: typing.Set[cibquery.ResourceAgent], + resource_agents: typing.Iterable[cibquery.ResourceAgent], +): + unsupported_resource_agents = [x for x in resource_agents if x not in supported_resource_agents] + if unsupported_resource_agents: + handler.handle_problem(False, '', [ + '* ' + ':'.join(x for x in dataclasses.astuple(resource_agent) if x is not None) + for resource_agent in unsupported_resource_agents + ]) + + +def _check_ocfs2(handler: CheckResultHandler, cib: lxml.etree.Element): + if cibquery.has_primitive_filesystem_with_fstype(cib, 'ocfs2'): + handler.handle_problem(False, 'OCFS2 is not supported in SLES 16. 
Please use GFS2.', [ + ]) diff --git a/crmsh/prun/runner.py b/crmsh/prun/runner.py index fc5f577c5..12682f4f7 100644 --- a/crmsh/prun/runner.py +++ b/crmsh/prun/runner.py @@ -1,5 +1,6 @@ # runner.py - fork and exec multiple child processes concurrently import asyncio +import contextlib import fcntl import os import select @@ -46,16 +47,18 @@ def add_task(self, task: Task): self._tasks.append(task) def run(self, timeout_seconds: int = -1): - awaitable = asyncio.gather( - *[ - self._concurrency_limit(self._concurrency_limiter, self._run(task)) - for task in self._tasks - ], - return_exceptions=True, - ) - if timeout_seconds > 0: - awaitable = self._timeout_limit(timeout_seconds, awaitable) - return asyncio.get_event_loop_policy().get_event_loop().run_until_complete(awaitable) + with contextlib.closing(asyncio.new_event_loop()) as event_loop: + asyncio.set_event_loop(event_loop) + awaitable = asyncio.gather( + *[ + self._concurrency_limit(self._concurrency_limiter, self._run(task)) + for task in self._tasks + ], + return_exceptions=True, + ) + if timeout_seconds > 0: + awaitable = self._timeout_limit(timeout_seconds, awaitable) + return event_loop.run_until_complete(awaitable) async def _timeout_limit(self, timeout_seconds: int, awaitable: typing.Awaitable): assert timeout_seconds > 0 diff --git a/crmsh/sh.py b/crmsh/sh.py index c4f537eb3..006e964ae 100644 --- a/crmsh/sh.py +++ b/crmsh/sh.py @@ -206,7 +206,7 @@ def get_stdout_or_raise_error( user: typing.Optional[str], cmd: str, success_exit_status: typing.Optional[typing.Set[int]] = None, - ): + ) -> str: result = self.su_subprocess_run( user, cmd, stdout=subprocess.PIPE, diff --git a/crmsh/ui_cluster.py b/crmsh/ui_cluster.py index 45a471694..5abee210e 100644 --- a/crmsh/ui_cluster.py +++ b/crmsh/ui_cluster.py @@ -9,7 +9,7 @@ from argparse import ArgumentParser, RawDescriptionHelpFormatter import crmsh.parallax -from . import command, sh, healthcheck +from . import command, sh, healthcheck, migration from . import utils from . import scripts from . import completers as compl @@ -771,45 +771,64 @@ def do_geo_init_arbitrator(self, context, *args): bootstrap.bootstrap_arbitrator(geo_context) return True - @command.completers(compl.choice(['hawk2'])) + @command.completers(compl.choice([ + 'hawk2', + 'sles16', + ])) def do_health(self, context, *args): ''' Extensive health check. 
         '''
         if not args:
             return self._do_health_legacy(context, *args)
-        parser = argparse.ArgumentParser()
-        parser.add_argument('component', choices=['hawk2'])
+        parser = argparse.ArgumentParser('health')
+        parser.add_argument('component', choices=['hawk2', 'sles16'])
         parser.add_argument('-f', '--fix', action='store_true')
-        parsed_args = parser.parse_args(args)
-        if parsed_args.component == 'hawk2':
-            nodes = utils.list_cluster_nodes()
-            if parsed_args.fix:
-                if not healthcheck.feature_full_check(healthcheck.PasswordlessPrimaryUserAuthenticationFeature(), nodes):
+        parsed_args, remaining_args = parser.parse_known_args(args)
+        match parsed_args.component:
+            case 'hawk2':
+                if remaining_args:
+                    logger.error('Unrecognized arguments: %s', ' '.join(remaining_args))
+                    return False
+                nodes = utils.list_cluster_nodes()
+                if parsed_args.fix:
+                    if not healthcheck.feature_full_check(healthcheck.PasswordlessPrimaryUserAuthenticationFeature(), nodes):
+                        try:
+                            healthcheck.feature_fix(
+                                healthcheck.PasswordlessPrimaryUserAuthenticationFeature(),
+                                nodes,
+                                utils.ask,
+                            )
+                        except healthcheck.FixFailure:
+                            logger.error('Cannot fix automatically.')
+                            return False
                     try:
-                        healthcheck.feature_fix(
-                            healthcheck.PasswordlessPrimaryUserAuthenticationFeature(),
-                            nodes,
-                            utils.ask,
-                        )
+                        healthcheck.feature_fix(healthcheck.PasswordlessHaclusterAuthenticationFeature(), nodes, utils.ask)
+                        logger.info("hawk2: passwordless ssh authentication: OK.")
+                        return True
                     except healthcheck.FixFailure:
-                        logger.error('Cannot fix automatically.')
+                        logger.error("hawk2: passwordless ssh authentication: FAIL.")
                         return False
-                try:
-                    healthcheck.feature_fix(healthcheck.PasswordlessHaclusterAuthenticationFeature(), nodes, utils.ask)
-                    logger.info("hawk2: passwordless ssh authentication: OK.")
-                    return True
-                except healthcheck.FixFailure:
-                    logger.error("hawk2: passwordless ssh authentication: FAIL.")
-                    return False
-            else:
-                if healthcheck.feature_full_check(healthcheck.PasswordlessHaclusterAuthenticationFeature(), nodes):
-                    logger.info("hawk2: passwordless ssh authentication: OK.")
-                    return True
                 else:
-                    logger.error("hawk2: passwordless ssh authentication: FAIL.")
-                    logger.warning('Please run "crm cluster health hawk2 --fix"')
+                    if healthcheck.feature_full_check(healthcheck.PasswordlessHaclusterAuthenticationFeature(), nodes):
+                        logger.info("hawk2: passwordless ssh authentication: OK.")
+                        return True
+                    else:
+                        logger.error("hawk2: passwordless ssh authentication: FAIL.")
+                        logger.warning('Please run "crm cluster health hawk2 --fix"')
+                        return False
+            case 'sles16':
+                try:
+                    if parsed_args.fix:
+                        return 0 == migration.migrate()
+                    else:
+                        return 0 == migration.check(['sles16'] + remaining_args)
+                except migration.MigrationFailure as e:
+                    logger.error('%s', e)
                     return False
+            case _:
+                logger.error('Unknown component: %s', parsed_args.component)
+                return False
 
     def _do_health_legacy(self, context, *args):
         params = self._args_implicit(context, args, 'nodes')
diff --git a/crmsh/xmlutil.py b/crmsh/xmlutil.py
index f85a7d749..eddf8b51c 100644
--- a/crmsh/xmlutil.py
+++ b/crmsh/xmlutil.py
@@ -4,6 +4,8 @@
 import os
 import subprocess
 
+import typing
+
 from lxml import etree, doctestcompare
 import copy
 import bz2
@@ -26,7 +28,7 @@
 logger_utils = log.LoggerUtils(logger)
 
 
-def xmlparse(f):
+def xmlparse(f: typing.IO[typing.AnyStr]) -> etree.Element:
     try:
         cib_elem = etree.parse(f).getroot()
     except Exception as msg:
@@ -122,7 +124,7 @@
     return None
 
 
-def text2elem(text):
+def text2elem(text: str) -> etree.Element:
     """
     Convert a text format CIB to an
    XML tree.
diff --git a/data-manifest b/data-manifest
index 1b549b114..ce5f25d4d 100644
--- a/data-manifest
+++ b/data-manifest
@@ -83,6 +83,7 @@ test/features/environment.py
 test/features/geo_setup.feature
 test/features/gfs2.feature
 test/features/healthcheck.feature
+test/features/migration.feature
 test/features/ocfs2.feature
 test/features/qdevice_options.feature
 test/features/qdevice_setup_remove.feature
@@ -187,6 +188,7 @@ test/unittests/scripts/workflows/10-webserver.xml
 test/unittests/test_bootstrap.py
 test/unittests/test_bugs.py
 test/unittests/test_cib.py
+test/unittests/test_cibquery.py
 test/unittests/test_cliformat.py
 test/unittests/test_cluster_fs.py
 test/unittests/test.conf
@@ -199,6 +201,7 @@ test/unittests/test_crashtest_utils.py
 test/unittests/test_gv.py
 test/unittests/test_handles.py
 test/unittests/test_lock.py
+test/unittests/test_migration.py
 test/unittests/test_objset.py
 test/unittests/test_parallax.py
 test/unittests/test_parse.py
diff --git a/doc/crm.8.adoc b/doc/crm.8.adoc
index 7a165f56c..b4ce1f733 100644
--- a/doc/crm.8.adoc
+++ b/doc/crm.8.adoc
@@ -996,14 +996,35 @@
 [[cmdhelp.cluster.health,Cluster health check]]
 ==== `health`
 
+Usage 1: General Health Check
+
 Runs a larger set of tests and queries on all nodes in the cluster
 to verify the general system health and detect potential problems.
 
-Usage:
 ...............
 health
 ...............
 
+Usage 2: Topic-Specific Health Check
+
+Verifies the health of a specified topic.
+
+...............
+health hawk2|sles16 [--fix]
+...............
+
+* `hawk2`: check or fix key-based ssh authentication for user hacluster, which
+is needed by hawk2.
+* `sles16`: check whether the cluster is good to migrate to SLES 16.
+
+The optional `--fix` argument attempts to automatically resolve any detected
+issues.
+
+.Note on sles16
+****************************
+`--fix` is only available after the OS is migrated to SLES 16.
+****************************
+
 [[cmdhelp.cluster.init,Initializes a new HA cluster,From Code]]
 ==== `init`
 
 See "crm cluster help init" or "crm cluster init --help"
diff --git a/etc/profiles.yml b/etc/profiles.yml
index ad66cac5a..3ab980c70 100644
--- a/etc/profiles.yml
+++ b/etc/profiles.yml
@@ -23,7 +23,7 @@ default:
   sbd.watchdog_timeout: 15
 
 knet-default:
-  corosync.totem.crypto_hash: sha1
+  corosync.totem.crypto_hash: sha256
   corosync.totem.crypto_cipher: aes256
 
 microsoft-azure:
diff --git a/test/features/migration.feature b/test/features/migration.feature
new file mode 100644
index 000000000..0f2c52818
--- /dev/null
+++ b/test/features/migration.feature
@@ -0,0 +1,104 @@
+# vim: sw=2 sts=2
+Feature: migration
+
+  Test migration and pre-migration checks
+  Need nodes: hanode1 hanode2
+
+  Scenario: Run pre-migration checks when cluster services are running.
+    Given Cluster service is "stopped" on "hanode1"
+    And Cluster service is "stopped" on "hanode2"
+    And Run "crm cluster init -y -N hanode2" OK on "hanode1"
+    When Try "crm cluster health sles16" on "hanode1"
+    Then Expected return code is "1"
+    And Expected "[FAIL] Cluster services are running" in stdout
+
+  Scenario: Run pre-migration checks with cluster services stopped.
+    When Run "crm cluster stop --all" on "hanode1"
+    And Run "crm cluster stop --all" on "hanode2"
+    And Try "crm cluster health sles16" on "hanode1"
+    Then Expected return code is "0"
+    And Expected "[INFO] This cluster works on SLES 16. No migration is needed." in stdout
+
+  Scenario: Should run fixes.
+    When Try "crm cluster health sles16 --fix" on "hanode1"
+    Then Expected return code is "0"
+
+  Scenario: Run pre-migration checks against corosync.conf generated in crmsh-4.6
+    When Run "rm -f /etc/corosync/corosync.conf" on "hanode1"
+    And Write multi lines to file "/etc/corosync/corosync.conf" on "hanode1"
+      """
+      # Please read the corosync.conf.5 manual page
+      totem {
+        version: 2
+        cluster_name: hacluster
+        clear_node_high_bit: yes
+        interface {
+          ringnumber: 0
+          mcastport: 5405
+          ttl: 1
+        }
+
+        transport: udpu
+        crypto_hash: sha1
+        crypto_cipher: aes256
+        token: 5000
+        join: 60
+        max_messages: 20
+        token_retransmits_before_loss_const: 10
+      }
+
+      logging {
+        fileline: off
+        to_stderr: no
+        to_logfile: yes
+        logfile: /var/log/cluster/corosync.log
+        to_syslog: yes
+        debug: off
+        timestamp: on
+        logger_subsys {
+          subsys: QUORUM
+          debug: off
+        }
+
+      }
+
+      quorum {
+
+        # Enable and configure quorum subsystem (default: off)
+        # see also corosync.conf.5 and votequorum.5
+        provider: corosync_votequorum
+        expected_votes: 2
+        two_node: 1
+      }
+
+      nodelist {
+        node {
+          ring0_addr: @hanode1.ip.0
+          name: hanode1
+          nodeid: 1
+        }
+
+        node {
+          ring0_addr: @hanode2.ip.0
+          name: hanode2
+          nodeid: 2
+        }
+
+      }
+      """
+    And Run "crm cluster copy /etc/corosync/corosync.conf" on "hanode1"
+    And Try "crm cluster health sles16" on "hanode1"
+    Then Expected return code is "1"
+    And Expect stdout contains snippets ["[PASS] This cluster is good to migrate to SLES 16.", "[INFO] Please run \"crm cluster health sles16 --fix\" on any one of the above nodes.", "[WARN] Corosync transport \"udpu\" will be deprecated in corosync 3. Please use knet.", "----- localhost -----", "----- hanode2 -----"].
+
+  Scenario: Run fixes against corosync.conf generated in crmsh-4.6
+    When Try "crm cluster health sles16 --fix" on "hanode1"
+    Then Expected return code is "0"
+
+  Scenario: Run pre-migration checks when some of the nodes are offline.
+    When Run "systemctl stop sshd" on "hanode2"
+    And Try "crm cluster health sles16" on "hanode1"
+    Then Expected return code is "1"
+    And Expect stdout contains snippets ["Cannot create SSH connection to", "----- localhost -----", "----- hanode2 -----"].
diff --git a/test/features/steps/step_implementation.py b/test/features/steps/step_implementation.py
index 0a8a5063d..b44d1660c 100644
--- a/test/features/steps/step_implementation.py
+++ b/test/features/steps/step_implementation.py
@@ -1,3 +1,4 @@
+import ast
 import re
 import time
 import os
@@ -18,8 +19,8 @@
 
 def _parse_str(text):
-    return text[1:-1].encode('utf-8').decode('unicode_escape')
-_parse_str.pattern='".*"'
+    return ast.literal_eval(text)
+_parse_str.pattern='"([^"]|\\")*?"'
 
 behave.use_step_matcher("cfparse")
 
@@ -170,6 +171,13 @@
     context.stderr = None
 
 
+@then('Expect stdout contains snippets [{snippets:str+}].')
+def step_impl(context, snippets):
+    for snippet in snippets:
+        assert_in(snippet, context.stdout)
+    context.stdout = None
+
+
 @then('Expected regex "{reg_str}" in stdout')
 def step_impl(context, reg_str):
     res = re.search(reg_str, context.stdout)
diff --git a/test/run-functional-tests b/test/run-functional-tests
index dbdc25545..20dfb2018 100755
--- a/test/run-functional-tests
+++ b/test/run-functional-tests
@@ -228,8 +228,6 @@ deploy_ha_node() {
 
 create_node() {
-	info "Loading container image $CONTAINER_IMAGE..."
-
 	echo 16777216 > /proc/sys/net/core/rmem_max
 	echo 16777216 > /proc/sys/net/core/wmem_max
 	info "Create ha specific container networks..."
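For orientation, the corosync.conf rewrite exercised by the "Run fixes against corosync.conf generated in crmsh-4.6" scenario can be sketched end to end. Given the crmsh-4.6 style configuration written by the feature file above, migrate_udpu() plus migrate_crypto() in crmsh/migration.py would leave the relevant sections roughly as follows; this is a sketch only, since the exact layout is whatever corosync_config_format.DomSerializer emits, and the logging section and existing nodelist are kept unchanged:

    totem {
        version: 2
        cluster_name: hacluster
        clear_node_high_bit: yes
        interface {
            linknumber: 0
            mcastport: 5405
        }
        transport: knet
        crypto_hash: sha256
        crypto_cipher: aes256
        token: 5000
        join: 60
        max_messages: 20
        token_retransmits_before_loss_const: 10
    }

    quorum {
        provider: corosync_votequorum
        two_node: 1
    }

The udp-only interface keys (mcastaddr, bindnetaddr, broadcast, ttl) are dropped, ringnumber becomes linknumber, quorum.expected_votes is removed, and transport and crypto_hash are upgraded to knet and sha256.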
diff --git a/test/unittests/test_cibquery.py b/test/unittests/test_cibquery.py
new file mode 100644
index 000000000..e8b7e268f
--- /dev/null
+++ b/test/unittests/test_cibquery.py
@@ -0,0 +1,110 @@
+import unittest
+
+import lxml.etree
+
+from crmsh import cibquery
+from crmsh.cibquery import ResourceAgent
+
+
+class TestDataObjectResourceAgent(unittest.TestCase):
+    def test_eq(self):
+        self.assertEqual(ResourceAgent('foo', None, 'bar'), ResourceAgent('foo', None, 'bar'))
+
+    def test_set_eq(self):
+        self.assertSetEqual({ResourceAgent('foo', None, 'bar')}, {ResourceAgent('foo', None, 'bar')})
+
+
+class TestCibQuery(unittest.TestCase):
+    # A minimal CIB containing one primitive for each resource agent the
+    # assertions below expect, including a Filesystem primitive with
+    # fstype=ocfs2.
+    _TEST_DATA = """
+<cib validate-with="pacemaker-3.0">
+  <configuration>
+    <resources>
+      <primitive id="vip" class="ocf" provider="heartbeat" type="IPaddr2"/>
+      <primitive id="stonith-sbd" class="stonith" type="external/sbd"/>
+      <clone id="base-clone">
+        <group id="base-group">
+          <primitive id="dlm" class="ocf" provider="pacemaker" type="controld"/>
+          <primitive id="ocfs2-fs" class="ocf" provider="heartbeat" type="Filesystem">
+            <instance_attributes id="ocfs2-fs-instance_attributes">
+              <nvpair id="ocfs2-fs-instance_attributes-fstype" name="fstype" value="ocfs2"/>
+            </instance_attributes>
+          </primitive>
+        </group>
+      </clone>
+    </resources>
+  </configuration>
+</cib>
+"""
+
+    def setUp(self) -> None:
+        self.cib = lxml.etree.fromstring(self._TEST_DATA)
+
+    def test_get_resource_agents(self):
+        self.assertSetEqual(
+            {
+                ResourceAgent('ocf', 'heartbeat', 'IPaddr2'),
+                ResourceAgent('stonith', None, 'external/sbd'),
+                ResourceAgent('ocf', 'heartbeat', 'Filesystem'),
+                ResourceAgent('ocf', 'pacemaker', 'controld'),
+            },
+            cibquery.get_configured_resource_agents(self.cib),
+        )
+
+    def test_has_primitive_filesystem_with_fstype(self):
+        self.assertTrue(cibquery.has_primitive_filesystem_with_fstype(self.cib, 'ocfs2'))
+        self.assertFalse(cibquery.has_primitive_filesystem_with_fstype(self.cib, 'foo'))
diff --git a/test/unittests/test_migration.py b/test/unittests/test_migration.py
new file mode 100644
index 000000000..fdceedb88
--- /dev/null
+++ b/test/unittests/test_migration.py
@@ -0,0 +1,58 @@
+import re
+import unittest
+from unittest import mock
+
+from crmsh import migration, cibquery
+
+
+class TestCheckRemovedResourceAgents(unittest.TestCase):
+    def setUp(self):
+        self._handler = mock.Mock(migration.CheckResultHandler)
+
+    def test_load_supported_resource_agents(self):
+        s = migration._load_supported_resource_agents()
+        self.assertIn(cibquery.ResourceAgent('ocf', 'heartbeat', 'IPaddr2'), s)
+        self.assertIn(cibquery.ResourceAgent('stonith', None, 'fence_sbd'), s)
+        self.assertNotIn(cibquery.ResourceAgent('foo', None, 'bar'), s)
+
+    def test_check_removed_resource_agents(self):
+        migration._check_removed_resource_agents(
+            self._handler,
+            {cibquery.ResourceAgent('foo', None, 'bar')},
+            [cibquery.ResourceAgent('foo', 'bar', 'qux')]
+        )
+        self._handler.handle_problem.assert_called()
+
+    def test_check_version_range(self):
+        def check_fn(x):
+            migration._check_version_range(
+                self._handler,
+                'foo',
+                (1, 1,),
+                re.compile(r'^foo\s+(\d+(?:\.\d+)*)'),
+                x,
+            )
+        check_fn('foo 0')
+        self._handler.handle_problem.assert_called()
+        self._handler.handle_problem.reset_mock()
+        check_fn('foo 0.9')
+        self._handler.handle_problem.assert_called()
+        self._handler.handle_problem.reset_mock()
+        check_fn('foo 0.9.99')
+        self._handler.handle_problem.assert_called()
+        self._handler.handle_problem.reset_mock()
+        check_fn('foo 1')
+        self._handler.handle_problem.assert_called()
+        self._handler.handle_problem.reset_mock()
+        check_fn('foo 1.1')
+        self._handler.handle_problem.assert_not_called()
+        check_fn('foo 1.1.0')
+        self._handler.handle_problem.assert_not_called()
+        check_fn('foo 1.1.1')
+        self._handler.handle_problem.assert_not_called()
+        check_fn('foo 1.2')
+        self._handler.handle_problem.assert_not_called()
+        check_fn('foo 2')
+        self._handler.handle_problem.assert_not_called()
+        check_fn('foo 2.0')
+        self._handler.handle_problem.assert_not_called()
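The unit tests above exercise the check plumbing; the cross-node contract itself is small. As a minimal sketch, this is the shape of the one-line JSON document each peer emits for `crm cluster health sles16 --local --json=oneline`, as produced by CheckResultJsonHandler and parsed by check_remote() in crmsh/migration.py; the concrete problem and tip entries below are illustrative values, not fixed output:

    # Sketch of the payload produced by CheckResultJsonHandler.end() and
    # consumed by check_remote(); entries here are illustrative examples.
    import json

    payload = {
        "pass": False,  # set to False as soon as handle_problem() records anything
        "problems": [
            {
                "is_fatal": False,
                "title": "Cluster services are running",
                "descriptions": ["* corosync", "* pacemaker"],
            },
        ],
        "tips": [
            {
                "title": 'Corosync transport "udpu" will be deprecated in corosync 3. Please use knet.',
                "descriptions": [],
            },
        ],
    }
    # --json=oneline corresponds to indent=None, i.e. a single line of output.
    print(json.dumps(payload, ensure_ascii=False))

Any problem entry drives the aggregate exit status to 2 on the coordinating node, while tips alone yield 1 ("good to migrate, fixes available").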