From 63d95e917dc2d618dc6c54474b31692a37f6dd28 Mon Sep 17 00:00:00 2001 From: Takuya ASADA Date: Thu, 30 Jan 2025 03:59:12 +0000 Subject: [PATCH] resource: fallback to sysconf when failed to detect memory size from hwloc On the Fedora 41 AMI, on some aarch64 instances such as m7gd.16xlarge, a Seastar program such as Scylla fails to start up with the following error message: ``` $ /opt/scylladb/bin/scylla --log-to-stdout 1 WARNING: debug mode. Not for benchmarking or production hwloc/linux: failed to find sysfs cpu topology directory, aborting linux discovery. scylla: seastar/src/core/resource.cc:683: resources seastar::resource::allocate(configuration &): Assertion `!remain' failed. ``` It seems that hwloc fails to initialize because /sys/devices/system/cpu/cpu0/topology/ is not available on the instance. I debugged src/core/resource.cc to find out why the assertion fired, and found that alloc_from_node() fails because node->total_memory is 0. This is likely caused by the hwloc initialization failure described above. I also found that calculate_memory() goes wrong because machine->total_memory is also 0. To avoid the error in such environments, we need to fix up the memory size in both machine->total_memory and node->total_memory. We can use sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES) for this, just like we do in the non-hwloc version of allocate(). 
Fixes scylladb/scylladb#22382 Related scylladb/scylla-pkg#4797 --- src/core/resource.cc | 49 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/src/core/resource.cc b/src/core/resource.cc index 3464adc38f4..d6364d6fa00 100644 --- a/src/core/resource.cc +++ b/src/core/resource.cc @@ -228,6 +228,10 @@ optional read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) { namespace resource { +static unsigned long get_machine_memory_from_sysconf() { + return ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES)); +} + static size_t kernel_memory_reservation() { @@ -305,13 +309,25 @@ size_t div_roundup(size_t num, size_t denom) { return (num + denom - 1) / denom; } -static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map& used_mem, size_t alloc) { +static hwloc_uint64_t get_memory_from_hwloc_obj(hwloc_obj_t obj) { #if HWLOC_API_VERSION >= 0x00020000 - // FIXME: support nodes with multiple NUMA nodes, whatever that means - auto local_memory = node->total_memory; + auto total_memory = obj->total_memory; #else - auto local_memory = node->memory.local_memory; + auto total_memory = obj->memory.total_memory; #endif + return total_memory; +} + +static void set_memory_to_hwloc_obj(hwloc_obj_t machine, hwloc_uint64_t memory) { +#if HWLOC_API_VERSION >= 0x00020000 + machine->total_memory = memory; +#else + machine->memory.total_memory = memory; +#endif +} + +static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map& used_mem, size_t alloc) { + auto local_memory = get_memory_from_hwloc_obj(node); auto taken = std::min(local_memory - used_mem[node], alloc); if (taken) { used_mem[node] += taken; @@ -574,11 +590,13 @@ resources allocate(configuration& c) { auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE); assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1); auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0); -#if 
HWLOC_API_VERSION >= 0x00020000 - auto available_memory = machine->total_memory; -#else - auto available_memory = machine->memory.total_memory; -#endif + auto available_memory = get_memory_from_hwloc_obj(machine); + if (!available_memory) { + available_memory = get_machine_memory_from_sysconf(); + set_memory_to_hwloc_obj(machine, available_memory); + seastar_logger.warn("hwloc failed to detect machine-wide memory size, using memory size fetched from sysconf"); + } + size_t mem = calculate_memory(c, std::min(available_memory, cgroup::memory_limit())); // limit memory address to fit in 36-bit, see core/memory.cc:Memory map @@ -592,6 +610,7 @@ resources allocate(configuration& c) { std::vector> remains; auto cpu_sets = distribute_objects(topology, procs); + auto num_nodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE); for (auto&& cs : cpu_sets()) { auto cpu_id = hwloc_bitmap_first(cs); @@ -601,6 +620,16 @@ resources allocate(configuration& c) { if (node == nullptr) { orphan_pus.push_back(cpu_id); } else { + if (!get_memory_from_hwloc_obj(node)) { + // If hwloc fails to detect the hardware topology, it falls back to treating + // the system as a single-node configuration. While this code supports + // multi-node setups, the fallback behavior is safe and will function + // correctly in this case. 
+ assert(num_nodes == 1); + auto local_memory = get_machine_memory_from_sysconf(); + set_memory_to_hwloc_obj(node, local_memory); + seastar_logger.warn("hwloc failed to detect NUMA node memory size, using memory size fetched from sysconf"); + } cpu_to_node[cpu_id] = node; seastar_logger.debug("Assign CPU{} to NUMA{}", cpu_id, node->os_index); } @@ -730,7 +759,7 @@ allocate_io_queues(configuration c, std::vector cpus) { resources allocate(configuration& c) { resources ret; - auto available_memory = ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES)); + auto available_memory = get_machine_memory_from_sysconf(); auto mem = calculate_memory(c, available_memory); auto procs = c.cpus; ret.cpus.reserve(procs);