diff --git a/src/core/resource.cc b/src/core/resource.cc index 3464adc38f4..d6364d6fa00 100644 --- a/src/core/resource.cc +++ b/src/core/resource.cc @@ -228,6 +228,10 @@ optional read_setting_V1V2_as(std::string cg1_path, std::string cg2_fname) { namespace resource { +static unsigned long get_machine_memory_from_sysconf() { + return ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES)); +} + static size_t kernel_memory_reservation() { @@ -305,13 +309,25 @@ size_t div_roundup(size_t num, size_t denom) { return (num + denom - 1) / denom; } -static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map& used_mem, size_t alloc) { +static hwloc_uint64_t get_memory_from_hwloc_obj(hwloc_obj_t obj) { #if HWLOC_API_VERSION >= 0x00020000 - // FIXME: support nodes with multiple NUMA nodes, whatever that means - auto local_memory = node->total_memory; + auto total_memory = obj->total_memory; #else - auto local_memory = node->memory.local_memory; + auto total_memory = obj->memory.total_memory; #endif + return total_memory; +} + +static void set_memory_to_hwloc_obj(hwloc_obj_t machine, hwloc_uint64_t memory) { +#if HWLOC_API_VERSION >= 0x00020000 + machine->total_memory = memory; +#else + machine->memory.total_memory = memory; +#endif +} + +static size_t alloc_from_node(cpu& this_cpu, hwloc_obj_t node, std::unordered_map& used_mem, size_t alloc) { + auto local_memory = get_memory_from_hwloc_obj(node); auto taken = std::min(local_memory - used_mem[node], alloc); if (taken) { used_mem[node] += taken; @@ -574,11 +590,13 @@ resources allocate(configuration& c) { auto machine_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE); assert(hwloc_get_nbobjs_by_depth(topology, machine_depth) == 1); auto machine = hwloc_get_obj_by_depth(topology, machine_depth, 0); -#if HWLOC_API_VERSION >= 0x00020000 - auto available_memory = machine->total_memory; -#else - auto available_memory = machine->memory.total_memory; -#endif + auto available_memory = get_memory_from_hwloc_obj(machine); + if (!available_memory) { + available_memory = get_machine_memory_from_sysconf(); + set_memory_to_hwloc_obj(machine, available_memory); + seastar_logger.warn("hwloc failed to detect machine-wide memory size, using memory size fetched from sysconf"); + } + size_t mem = calculate_memory(c, std::min(available_memory, cgroup::memory_limit())); // limit memory address to fit in 36-bit, see core/memory.cc:Memory map @@ -592,6 +610,7 @@ resources allocate(configuration& c) { std::vector> remains; auto cpu_sets = distribute_objects(topology, procs); + auto num_nodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE); for (auto&& cs : cpu_sets()) { auto cpu_id = hwloc_bitmap_first(cs); @@ -601,6 +620,16 @@ resources allocate(configuration& c) { if (node == nullptr) { orphan_pus.push_back(cpu_id); } else { + if (!get_memory_from_hwloc_obj(node)) { + // If hwloc fails to detect the hardware topology, it falls back to treating + // the system as a single-node configuration. While this code supports + // multi-node setups, the fallback behavior is safe and will function + // correctly in this case. + assert(num_nodes == 1); + auto local_memory = get_machine_memory_from_sysconf(); + set_memory_to_hwloc_obj(node, local_memory); + seastar_logger.warn("hwloc failed to detect NUMA node memory size, using memory size fetched from sysfs"); + } cpu_to_node[cpu_id] = node; seastar_logger.debug("Assign CPU{} to NUMA{}", cpu_id, node->os_index); } @@ -730,7 +759,7 @@ allocate_io_queues(configuration c, std::vector cpus) { resources allocate(configuration& c) { resources ret; - auto available_memory = ::sysconf(_SC_PAGESIZE) * size_t(::sysconf(_SC_PHYS_PAGES)); + auto available_memory = get_machine_memory_from_sysconf(); auto mem = calculate_memory(c, available_memory); auto procs = c.cpus; ret.cpus.reserve(procs);