From 63d98018ea77b9a5121b0875a2aaa87b9dee7a81 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 21 Jul 2023 13:38:31 -0500 Subject: [PATCH 1/5] arch-x86: Move CPUID values to python CPUID values for X86 are currently hard-coded in the C++ source file. This makes it difficult to configure the bits if needed. Move these to python instead. This will provide a few benefits: 1. We can enable features for certain configurations, for example AVX can be enabled when the KVM CPU is used, but otherwise should not be enabled as gem5 does not have full AVX support. 2. We can more accurately communicate things like cache/TLB sizes based on the actual gem5 configuration. The CPUID values can be used by some libraries, e.g., MPI, to query system topology. 3. Enabling some bits breaks things in certain configurations and this can be prevented by configuring in python. For example, enabling AVX seems to currently be breaking SMP, meaning gem5 can only boot one CPU in that configuration. Change-Id: Ib3866f39c86d61374b9451e60b119a3155575884 --- src/arch/x86/X86ISA.py | 56 +++++ src/arch/x86/cpuid.cc | 229 +++++++----------- src/arch/x86/cpuid.hh | 81 +++++-- src/arch/x86/isa.cc | 15 +- src/arch/x86/isa.hh | 3 + src/arch/x86/isa/decoder/two_byte_opcodes.isa | 5 +- src/arch/x86/isa/includes.isa | 1 + src/arch/x86/kvm/x86_cpu.cc | 10 +- 8 files changed, 231 insertions(+), 169 deletions(-) diff --git a/src/arch/x86/X86ISA.py b/src/arch/x86/X86ISA.py index bb72c415e9..aa5c29a98e 100644 --- a/src/arch/x86/X86ISA.py +++ b/src/arch/x86/X86ISA.py @@ -54,3 +54,59 @@ class X86ISA(BaseISA): vendor_string = Param.String( "HygonGenuine", "Vendor string for CPUID instruction" ) + name_string = Param.String( + "Fake gem5 x86_64 CPU", "Processor name for CPUID instruction" + ) + + # For the functions that return numerical values we use a vector of ints. + # The order of the values is: EAX, EBX, EDX, ECX.
+ # + # If the CPU function can take an index, the index value is used as an + # offset into the vector and four numerical values are added for each + # possible index value. For example, if the function accepts 3 index + # values, there are 12 total ints in the vector param. In addition, the + # last values for functions which take an index must be all zeros. All + # zeros indicates to the KVM cpu / OS that there are no more index values + # to iterate over. + # + # A good resource for these values can be found here: + # https://sandpile.org/x86/cpuid.htm + # 0000_0001h + FamilyModelStepping = VectorParam.UInt32( + [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x00000209], + "type/family/model/stepping and feature flags", + ) + # 0000_0004h + CacheParams = VectorParam.UInt32( + [0x00000000, 0x00000000, 0x00000000, 0x00000000], + "cache configuration descriptors", + ) + # 0000_0007h + ExtendedFeatures = VectorParam.UInt32( + [0x00000000, 0x01800000, 0x00000000, 0x00000000], "feature flags" + ) + # 8000_0001h + FamilyModelSteppingBrandFeatures = VectorParam.UInt32( + [0x00020F51, 0x00000405, 0xEBD3FBFF, 0x00020001], + "family/model/stepping and features flags", + ) + # 8000_0005h + L1CacheAndTLB = VectorParam.UInt32( + [0xFF08FF08, 0xFF20FF20, 0x40020140, 0x40020140], + "L1 cache and L1 TLB configuration descriptors", + ) + # 8000_0006h + L2L3CacheAndL2TLB = VectorParam.UInt32( + [0x00000000, 0x42004200, 0x00000000, 0x04008140], + "L2/L3 cache and L2 TLB configuration descriptors", + ) + # 8000_0007h + APMInfo = VectorParam.UInt32( + [0x80000018, 0x68747541, 0x69746E65, 0x444D4163], + "processor feedback capabilities", + ) + # 8000_0008h + LongModeAddressSize = VectorParam.UInt32( + [0x00003030, 0x00000000, 0x00000000, 0x00000000], + "miscellaneous information", + ) diff --git a/src/arch/x86/cpuid.cc b/src/arch/x86/cpuid.cc index ac4709ce0e..2ce763e0af 100644 --- a/src/arch/x86/cpuid.cc +++ b/src/arch/x86/cpuid.cc @@ -31,162 +31,105 @@ #include "arch/x86/isa.hh" #include 
"base/bitfield.hh" #include "cpu/thread_context.hh" +#include "debug/X86.hh" namespace gem5 { -namespace X86ISA { - enum StandardCpuidFunction - { - VendorAndLargestStdFunc, - FamilyModelStepping, - CacheAndTLB, - SerialNumber, - CacheParams, - MonitorMwait, - ThermalPowerMgmt, - ExtendedFeatures, - NumStandardCpuidFuncs - }; +namespace X86ISA +{ - enum ExtendedCpuidFunctions - { - VendorAndLargestExtFunc, - FamilyModelSteppingBrandFeatures, - NameString1, - NameString2, - NameString3, - L1CacheAndTLB, - L2L3CacheAndL2TLB, - APMInfo, - LongModeAddressSize, +X86CPUID::X86CPUID(const std::string& vendor, const std::string& name) + : vendorString(vendor), nameString(name) +{ + fatal_if(vendorString.size() != 12, + "CPUID vendor string must be 12 characters\n"); +} - /* - * The following are defined by the spec but not yet implemented - */ -/* // Function 9 is reserved - SVMInfo = 10, - // Functions 11-24 are reserved - TLB1GBPageInfo = 25, - PerformanceInfo,*/ +void +X86CPUID::addStandardFunc(uint32_t func, std::vector values) +{ + capabilities[func] = values; +} - NumExtendedCpuidFuncs - }; +void +X86CPUID::addExtendedFunc(uint32_t func, std::vector values) +{ + // Extended functions begin with 8000_0000h, but the enum is based from + // zero, so we need to add that to the function value. 
+ capabilities[func | 0x80000000] = values; +} - static const int nameStringSize = 48; - static const char nameString[nameStringSize] = "Fake M5 x86_64 CPU"; +bool +X86CPUID::doCpuid(ThreadContext * tc, uint32_t function, uint32_t index, + CpuidResult &result) +{ + constexpr uint32_t ext = 0x80000000; - uint64_t - stringToRegister(const char *str) - { - uint64_t reg = 0; - for (int pos = 3; pos >=0; pos--) { - reg <<= 8; - reg |= str[pos]; - } - return reg; - } + DPRINTF(X86, "Calling CPUID function %x with index %d\n", function, index); - bool - doCpuid(ThreadContext * tc, uint32_t function, - uint32_t index, CpuidResult &result) - { - uint16_t family = bits(function, 31, 16); - uint16_t funcNum = bits(function, 15, 0); - if (family == 0x8000) { - // The extended functions - switch (funcNum) { - case VendorAndLargestExtFunc: - { - ISA *isa = dynamic_cast(tc->getIsaPtr()); - auto vendor_string = isa->getVendorString(); - result = CpuidResult( - 0x80000000 + NumExtendedCpuidFuncs - 1, - stringToRegister(vendor_string.c_str()), - stringToRegister(vendor_string.c_str() + 4), - stringToRegister(vendor_string.c_str() + 8)); - } - break; - case FamilyModelSteppingBrandFeatures: - result = CpuidResult(0x00020f51, 0x00000405, - 0xebd3fbff, 0x00020001); - break; - case NameString1: - case NameString2: - case NameString3: - { - // Zero fill anything beyond the end of the string. This - // should go away once the string is a vetted parameter. 
- char cleanName[nameStringSize]; - memset(cleanName, '\0', nameStringSize); - strncpy(cleanName, nameString, nameStringSize); + // Handle the string-related CPUID functions specially + if (function == VendorAndLargestStdFunc) { + result = CpuidResult(NumStandardCpuidFuncs - 1, + stringToRegister(vendorString.c_str()), + stringToRegister(vendorString.c_str() + 4), + stringToRegister(vendorString.c_str() + 8)); - int offset = (funcNum - NameString1) * 16; - assert(nameStringSize >= offset + 16); - result = CpuidResult( - stringToRegister(cleanName + offset + 0), - stringToRegister(cleanName + offset + 4), - stringToRegister(cleanName + offset + 12), - stringToRegister(cleanName + offset + 8)); - } - break; - case L1CacheAndTLB: - result = CpuidResult(0xff08ff08, 0xff20ff20, - 0x40020140, 0x40020140); - break; - case L2L3CacheAndL2TLB: - result = CpuidResult(0x00000000, 0x42004200, - 0x00000000, 0x04008140); - break; - case APMInfo: - result = CpuidResult(0x80000018, 0x68747541, - 0x69746e65, 0x444d4163); - break; - case LongModeAddressSize: - result = CpuidResult(0x00003030, 0x00000000, - 0x00000000, 0x00000000); - break; -/* case SVMInfo: - case TLB1GBPageInfo: - case PerformanceInfo:*/ - default: - warn("x86 cpuid family 0x8000: unimplemented function %u", - funcNum); - return false; - } - } else if (family == 0x0000) { - // The standard functions - switch (funcNum) { - case VendorAndLargestStdFunc: - { - ISA *isa = dynamic_cast(tc->getIsaPtr()); - auto vendor_string = isa->getVendorString(); - result = CpuidResult( - NumStandardCpuidFuncs - 1, - stringToRegister(vendor_string.c_str()), - stringToRegister(vendor_string.c_str() + 4), - stringToRegister(vendor_string.c_str() + 8)); - } - break; - case FamilyModelStepping: - result = CpuidResult(0x00020f51, 0x00000805, - 0xefdbfbff, 0x00000209); - break; - case ExtendedFeatures: - result = CpuidResult(0x00000000, 0x01800000, - 0x00000000, 0x00000000); - break; - default: - warn("x86 cpuid family 0x0000: unimplemented 
function %u", - funcNum); - return false; - } - } else { - warn("x86 cpuid: unknown family %#x", family); - return false; - } + return true; + } else if (function == (ext | VendorAndLargestExtFunc)) { + result = CpuidResult(0x80000000 + NumExtendedCpuidFuncs - 1, + stringToRegister(vendorString.c_str()), + stringToRegister(vendorString.c_str() + 4), + stringToRegister(vendorString.c_str() + 8)); + + return true; + } else if ((function == (ext | NameString1)) || + (function == (ext | NameString2)) || + (function == (ext | NameString3))) { + // Zero fill anything beyond the end of the string. This + // should go away once the string is a vetted parameter. + char cleanName[nameStringSize]; + memset(cleanName, '\0', nameStringSize); + strncpy(cleanName, nameString.c_str(), nameStringSize-1); + + int funcNum = bits(function, 15, 0); + int offset = (funcNum - NameString1) * 16; + assert(nameStringSize >= offset + 16); + result = CpuidResult( + stringToRegister(cleanName + offset + 0), + stringToRegister(cleanName + offset + 4), + stringToRegister(cleanName + offset + 12), + stringToRegister(cleanName + offset + 8)); return true; } + + // Ignore anything not in the map of supported CPUID functions. + // This is checked after the string-related functions as those are not + // in the capabilities map. 
+ if (!capabilities.count(function)) { + return false; + } + + auto &cap_vec = capabilities[function]; + result = CpuidResult(cap_vec[0], cap_vec[1], + cap_vec[2], cap_vec[3]); + DPRINTF(X86, "CPUID function %x returning (%x, %x, %x, %x)\n", + function, result.rax, result.rbx, result.rdx, result.rcx); + + return true; +} + +uint64_t +X86CPUID::stringToRegister(const char *str) +{ + uint64_t reg = 0; + for (int pos = 3; pos >=0; pos--) { + reg <<= 8; + reg |= str[pos]; + } + return reg; +} + } // namespace X86ISA } // namespace gem5 diff --git a/src/arch/x86/cpuid.hh b/src/arch/x86/cpuid.hh index 5c1a8ccb16..a48d99907d 100644 --- a/src/arch/x86/cpuid.hh +++ b/src/arch/x86/cpuid.hh @@ -29,7 +29,10 @@ #ifndef __ARCH_X86_CPUID_HH__ #define __ARCH_X86_CPUID_HH__ +#include + #include "base/types.hh" +#include "params/X86ISA.hh" namespace gem5 { @@ -38,28 +41,72 @@ class ThreadContext; namespace X86ISA { - struct CpuidResult - { - uint64_t rax; - uint64_t rbx; - uint64_t rcx; - uint64_t rdx; - // These are not in alphebetical order on purpose. The order reflects - // how the CPUID orders the registers when it returns results. - CpuidResult(uint64_t _rax, uint64_t _rbx, - uint64_t _rdx, uint64_t _rcx) : - rax(_rax), rbx(_rbx), rcx(_rcx), rdx(_rdx) - {} +enum StandardCpuidFunction +{ + VendorAndLargestStdFunc, + FamilyModelStepping, + CacheAndTLB, + SerialNumber, + CacheParams, + MonitorMwait, + ThermalPowerMgmt, + ExtendedFeatures, + NumStandardCpuidFuncs +}; - CpuidResult() - {} - }; +enum ExtendedCpuidFunctions +{ + VendorAndLargestExtFunc, + FamilyModelSteppingBrandFeatures, + NameString1, + NameString2, + NameString3, + L1CacheAndTLB, + L2L3CacheAndL2TLB, + APMInfo, + LongModeAddressSize, + NumExtendedCpuidFuncs +}; - uint64_t stringToRegister(const char *str); +constexpr int nameStringSize = 48; + +struct CpuidResult +{ + uint64_t rax; + uint64_t rbx; + uint64_t rcx; + uint64_t rdx; + + // These are not in alphabetical order on purpose.
The order reflects + // how the CPUID orders the registers when it returns results. + CpuidResult(uint64_t _rax, uint64_t _rbx, + uint64_t _rdx, uint64_t _rcx) : + rax(_rax), rbx(_rbx), rcx(_rcx), rdx(_rdx) + {} + + CpuidResult() + {} +}; + +class X86CPUID +{ + public: + X86CPUID(const std::string& vendor, const std::string& name); + + void addStandardFunc(uint32_t func, std::vector values); + void addExtendedFunc(uint32_t func, std::vector values); bool doCpuid(ThreadContext * tc, uint32_t function, - uint32_t index, CpuidResult &result); + uint32_t index, CpuidResult &result); + + private: + const std::string vendorString; + const std::string nameString; + std::unordered_map> capabilities; + + uint64_t stringToRegister(const char *str); +}; } // namespace X86ISA } // namespace gem5 diff --git a/src/arch/x86/isa.cc b/src/arch/x86/isa.cc index 31efae3a43..cf1ff9f593 100644 --- a/src/arch/x86/isa.cc +++ b/src/arch/x86/isa.cc @@ -151,10 +151,19 @@ RegClass matRegClass(MatRegClass, MatRegClassName, 1, debug::MatRegs); } // anonymous namespace -ISA::ISA(const X86ISAParams &p) : BaseISA(p), vendorString(p.vendor_string) +ISA::ISA(const X86ISAParams &p) + : BaseISA(p), cpuid(new X86CPUID(p.vendor_string, p.name_string)) { - fatal_if(vendorString.size() != 12, - "CPUID vendor string must be 12 characters\n"); + cpuid->addStandardFunc(FamilyModelStepping, p.FamilyModelStepping); + cpuid->addStandardFunc(CacheParams, p.CacheParams); + cpuid->addStandardFunc(ExtendedFeatures, p.ExtendedFeatures); + + cpuid->addExtendedFunc(FamilyModelSteppingBrandFeatures, + p.FamilyModelSteppingBrandFeatures); + cpuid->addExtendedFunc(L1CacheAndTLB, p.L1CacheAndTLB); + cpuid->addExtendedFunc(L2L3CacheAndL2TLB, p.L2L3CacheAndL2TLB); + cpuid->addExtendedFunc(APMInfo, p.APMInfo); + cpuid->addExtendedFunc(LongModeAddressSize, p.LongModeAddressSize); _regClasses.push_back(&flatIntRegClass); _regClasses.push_back(&flatFloatRegClass); diff --git a/src/arch/x86/isa.hh b/src/arch/x86/isa.hh index 
f7ae210f96..9c6dcf0921 100644 --- a/src/arch/x86/isa.hh +++ b/src/arch/x86/isa.hh @@ -33,6 +33,7 @@ #include #include "arch/generic/isa.hh" +#include "arch/x86/cpuid.hh" #include "arch/x86/pcstate.hh" #include "arch/x86/regs/ccr.hh" #include "arch/x86/regs/float.hh" @@ -93,6 +94,8 @@ class ISA : public BaseISA void setThreadContext(ThreadContext *_tc) override; std::string getVendorString() const; + + std::unique_ptr cpuid; }; } // namespace X86ISA diff --git a/src/arch/x86/isa/decoder/two_byte_opcodes.isa b/src/arch/x86/isa/decoder/two_byte_opcodes.isa index 38937cb3e2..dac5706a06 100644 --- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa +++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa @@ -690,8 +690,9 @@ } 0x2: CPUIDInst::CPUID({{ CpuidResult result; - bool success = doCpuid(xc->tcBase(), bits(Rax, 31, 0), - bits(Rcx, 31, 0), result); + ISA *isa = dynamic_cast(xc->tcBase()->getIsaPtr()); + bool success = isa->cpuid->doCpuid(xc->tcBase(), + bits(Rax, 31, 0), bits(Rcx, 31, 0), result); if (success) { Rax = result.rax; Rbx = result.rbx; diff --git a/src/arch/x86/isa/includes.isa b/src/arch/x86/isa/includes.isa index 6fc5f448a0..9445f2032b 100644 --- a/src/arch/x86/isa/includes.isa +++ b/src/arch/x86/isa/includes.isa @@ -63,6 +63,7 @@ output header {{ #include "arch/x86/insts/microregop.hh" #include "arch/x86/insts/microspecop.hh" #include "arch/x86/insts/static_inst.hh" +#include "arch/x86/isa.hh" #include "arch/x86/regs/ccr.hh" #include "arch/x86/regs/int.hh" #include "arch/x86/regs/misc.hh" diff --git a/src/arch/x86/kvm/x86_cpu.cc b/src/arch/x86/kvm/x86_cpu.cc index 7faa9159ab..fdb557af88 100644 --- a/src/arch/x86/kvm/x86_cpu.cc +++ b/src/arch/x86/kvm/x86_cpu.cc @@ -37,6 +37,7 @@ #include "arch/x86/cpuid.hh" #include "arch/x86/faults.hh" #include "arch/x86/interrupts.hh" +#include "arch/x86/isa.hh" #include "arch/x86/regs/float.hh" #include "arch/x86/regs/int.hh" #include "arch/x86/regs/msr.hh" @@ -1443,26 +1444,27 @@ X86KvmCPU::updateCPUID() * currently not a 
problem since M5 doesn't expose any of them at * the moment. */ + X86ISA::ISA *isa = dynamic_cast(tc->getIsaPtr()); /* Basic features */ CpuidResult func0; - X86ISA::doCpuid(tc, 0x0, 0, func0); + isa->cpuid->doCpuid(tc, 0x0, 0, func0); for (uint32_t function = 0; function <= func0.rax; ++function) { CpuidResult cpuid; uint32_t idx(0); - X86ISA::doCpuid(tc, function, idx, cpuid); + isa->cpuid->doCpuid(tc, function, idx, cpuid); m5_supported.push_back(makeKvmCpuid(function, idx, cpuid)); } /* Extended features */ CpuidResult efunc0; - X86ISA::doCpuid(tc, 0x80000000, 0, efunc0); + isa->cpuid->doCpuid(tc, 0x80000000, 0, efunc0); for (uint32_t function = 0x80000000; function <= efunc0.rax; ++function) { CpuidResult cpuid; uint32_t idx(0); - X86ISA::doCpuid(tc, function, idx, cpuid); + isa->cpuid->doCpuid(tc, function, idx, cpuid); m5_supported.push_back(makeKvmCpuid(function, idx, cpuid)); } From 3946f7ba2c825afbd2c995201d5a9e4e2fe0f11c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 21 Jul 2023 14:08:18 -0500 Subject: [PATCH 2/5] arch-x86: Support CPUID functions with indexes Various CPUID functions will return different values depending on the value of ECX when executing the CPUID instruction. Add support for this in the X86 KVM CPU. A subsequent patch will add a CPUID function which requires iterating through multiple ECX values. Change-Id: Ib44a52be52ea632d5e2cee3fb2ca390b60a7202a --- src/arch/x86/cpuid.cc | 22 ++++++++++-- src/arch/x86/cpuid.hh | 1 + src/arch/x86/kvm/x86_cpu.cc | 71 ++++++++++++++++++++++++++++++------- 3 files changed, 80 insertions(+), 14 deletions(-) diff --git a/src/arch/x86/cpuid.cc b/src/arch/x86/cpuid.cc index 2ce763e0af..75e69735c1 100644 --- a/src/arch/x86/cpuid.cc +++ b/src/arch/x86/cpuid.cc @@ -111,9 +111,19 @@ X86CPUID::doCpuid(ThreadContext * tc, uint32_t function, uint32_t index, return false; } + int cap_offset = 0; + + // Ignore index values for functions that do not take index values. 
+ if (hasSignificantIndex(function)) { + cap_offset = index * 4; + } + + // Ensure we have the offset and 4 dwords after it. + assert(capabilities[function].size() >= (cap_offset + 4)); + auto &cap_vec = capabilities[function]; - result = CpuidResult(cap_vec[0], cap_vec[1], - cap_vec[2], cap_vec[3]); + result = CpuidResult(cap_vec[cap_offset + 0], cap_vec[cap_offset + 1], + cap_vec[cap_offset + 2], cap_vec[cap_offset + 3]); DPRINTF(X86, "CPUID function %x returning (%x, %x, %x, %x)\n", function, result.rax, result.rbx, result.rdx, result.rcx); @@ -131,5 +141,13 @@ X86CPUID::stringToRegister(const char *str) return reg; } +// Return true if the CPUID function takes ECX index as an input AND +// those multiple index values are supported in gem5. +bool +X86CPUID::hasSignificantIndex(uint32_t function) +{ + return false; +} + } // namespace X86ISA } // namespace gem5 diff --git a/src/arch/x86/cpuid.hh b/src/arch/x86/cpuid.hh index a48d99907d..71e8d3c626 100644 --- a/src/arch/x86/cpuid.hh +++ b/src/arch/x86/cpuid.hh @@ -99,6 +99,7 @@ class X86CPUID bool doCpuid(ThreadContext * tc, uint32_t function, uint32_t index, CpuidResult &result); + bool hasSignificantIndex(uint32_t function); private: const std::string vendorString; diff --git a/src/arch/x86/kvm/x86_cpu.cc b/src/arch/x86/kvm/x86_cpu.cc index fdb557af88..e1c1b0dfc0 100644 --- a/src/arch/x86/kvm/x86_cpu.cc +++ b/src/arch/x86/kvm/x86_cpu.cc @@ -74,6 +74,13 @@ using namespace X86ISA; // data) is used to indicate that a segment has been accessed. #define SEG_TYPE_BIT_ACCESSED 1 +// Some Linux distros (e.g., RHEL7) define the KVM macros using "BIT" but do +// not include where BIT is defined, so define it here in that case.
+#ifndef BIT +#define BIT(nr) (1UL << (nr)) +#endif + + struct GEM5_PACKED FXSave { uint16_t fcw; @@ -1420,12 +1427,12 @@ X86KvmCPU::ioctlRun() static struct kvm_cpuid_entry2 makeKvmCpuid(uint32_t function, uint32_t index, - CpuidResult &result) + CpuidResult &result, uint32_t flags = 0) { struct kvm_cpuid_entry2 e; e.function = function; e.index = index; - e.flags = 0; + e.flags = flags; e.eax = (uint32_t)result.rax; e.ebx = (uint32_t)result.rbx; e.ecx = (uint32_t)result.rcx; @@ -1438,12 +1445,6 @@ void X86KvmCPU::updateCPUID() { Kvm::CPUIDVector m5_supported; - - /* TODO: We currently don't support any of the functions that - * iterate through data structures in the CPU using an index. It's - * currently not a problem since M5 doesn't expose any of them at - * the moment. - */ X86ISA::ISA *isa = dynamic_cast(tc->getIsaPtr()); /* Basic features */ @@ -1453,8 +1454,31 @@ X86KvmCPU::updateCPUID() CpuidResult cpuid; uint32_t idx(0); - isa->cpuid->doCpuid(tc, function, idx, cpuid); - m5_supported.push_back(makeKvmCpuid(function, idx, cpuid)); + if (!isa->cpuid->hasSignificantIndex(function)) { + isa->cpuid->doCpuid(tc, function, idx, cpuid); + m5_supported.push_back(makeKvmCpuid(function, idx, cpuid)); + } else { + while (true) { + bool rv = isa->cpuid->doCpuid(tc, function, idx, cpuid); + assert(rv); + + if (idx && + !cpuid.rax && !cpuid.rbx && !cpuid.rdx && !cpuid.rcx) { + break; + } + + /* + * For functions in family 0, this flag tells Linux to compare + * the index as well as the function number rather than only + * the function number. Important: Do NOT set this flag if the + * function does not take an index. Doing so will break SMP. 
+ */ + uint32_t flag = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + m5_supported.push_back( + makeKvmCpuid(function, idx, cpuid, flag)); + idx++; + } + } } /* Extended features */ @@ -1464,8 +1488,31 @@ X86KvmCPU::updateCPUID() CpuidResult cpuid; uint32_t idx(0); - isa->cpuid->doCpuid(tc, function, idx, cpuid); - m5_supported.push_back(makeKvmCpuid(function, idx, cpuid)); + if (!isa->cpuid->hasSignificantIndex(function)) { + isa->cpuid->doCpuid(tc, function, idx, cpuid); + m5_supported.push_back(makeKvmCpuid(function, idx, cpuid)); + } else { + while (true) { + bool rv = isa->cpuid->doCpuid(tc, function, idx, cpuid); + assert(rv); + + if (idx && + !cpuid.rax && !cpuid.rbx && !cpuid.rdx && !cpuid.rcx) { + break; + } + + /* + * For functions in family 0, this flag tells Linux to compare + * the index as well as the function number rather than only + * the function number. Important: Do NOT set this flag if the + * function does not take an index. Doing so will break SMP. + */ + uint32_t flag = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + m5_supported.push_back( + makeKvmCpuid(function, idx, cpuid, flag)); + idx++; + } + } } setCPUID(m5_supported); From 3584c3126cf5ecc7cc9df8011fb7ca19e2194243 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 21 Jul 2023 16:17:54 -0500 Subject: [PATCH 3/5] arch-x86: Expose CR4.osxsave bit Related to the recent changes with moving CPUID values to python, this value is needed to enable AVX and needs a way to be exposed to python as well in order to set the bit and the corresponding CPUID values at the same time. 
Change-Id: I3cadb0fe61ff4ebf6de903018a8d8a411bfdb4e0 --- src/arch/x86/X86FsWorkload.py | 1 + src/arch/x86/fs_workload.cc | 4 +++- src/arch/x86/fs_workload.hh | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/arch/x86/X86FsWorkload.py b/src/arch/x86/X86FsWorkload.py index 294241b51c..277a37988e 100644 --- a/src/arch/x86/X86FsWorkload.py +++ b/src/arch/x86/X86FsWorkload.py @@ -65,6 +65,7 @@ class X86FsWorkload(KernelWorkload): acpi_description_table_pointer = Param.X86ACPIRSDP( X86ACPIRSDP(), "ACPI root description pointer structure" ) + enable_osxsave = Param.Bool(False, "Enable OSXSAVE in CR4 register") class X86FsLinux(X86FsWorkload): diff --git a/src/arch/x86/fs_workload.cc b/src/arch/x86/fs_workload.cc index 1a412380a6..88d7deed68 100644 --- a/src/arch/x86/fs_workload.cc +++ b/src/arch/x86/fs_workload.cc @@ -58,7 +58,8 @@ FsWorkload::FsWorkload(const Params &p) : KernelWorkload(p), smbiosTable(p.smbios_table), mpFloatingPointer(p.intel_mp_pointer), mpConfigTable(p.intel_mp_table), - rsdp(p.acpi_description_table_pointer) + rsdp(p.acpi_description_table_pointer), + enable_osxsave(p.enable_osxsave) {} void @@ -295,6 +296,7 @@ FsWorkload::initState() CR4 cr4 = tc->readMiscRegNoEffect(misc_reg::Cr4); // Turn on pae. cr4.pae = 1; + cr4.osxsave = enable_osxsave; tc->setMiscReg(misc_reg::Cr4, cr4); // Point to the page tables. 
diff --git a/src/arch/x86/fs_workload.hh b/src/arch/x86/fs_workload.hh index 9d14f91bb5..81db414fb2 100644 --- a/src/arch/x86/fs_workload.hh +++ b/src/arch/x86/fs_workload.hh @@ -106,6 +106,9 @@ class FsWorkload : public KernelWorkload Addr &fpSize, Addr &tableSize, Addr table=0); void writeOutACPITables(Addr begin, Addr &size); + + private: + bool enable_osxsave; }; } // namespace X86ISA From 7c3c2b05f38f20ddd5be5e5f2d5828c36ba95528 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Thu, 27 Jul 2023 10:44:34 -0500 Subject: [PATCH 4/5] arch-x86: Add extended state CPUID function The extended state CPUID function is used to set the values of the XCR0 register as well as to specify the size of the context-switch storage for x87 and AVX+. This function is iterative and therefore requires (1) marking it as such in the hasSignificantIndex function and (2) setting multiple sets of 4-tuples for the default CPUID values where the last 4-tuple is all zeros. Change-Id: Ib6a43925afb1cae75f61d8acff52a3cc26ce17c8 --- src/arch/x86/X86ISA.py | 14 ++++++++++++++ src/arch/x86/cpuid.cc | 12 ++++++++++++ src/arch/x86/cpuid.hh | 1 + src/arch/x86/isa.cc | 1 + 4 files changed, 28 insertions(+) diff --git a/src/arch/x86/X86ISA.py b/src/arch/x86/X86ISA.py index aa5c29a98e..aa48d1aa6e 100644 --- a/src/arch/x86/X86ISA.py +++ b/src/arch/x86/X86ISA.py @@ -85,6 +85,20 @@ class X86ISA(BaseISA): ExtendedFeatures = VectorParam.UInt32( [0x00000000, 0x01800000, 0x00000000, 0x00000000], "feature flags" ) + # 0000_000Dh - This uses ECX index, so the last entry must be all zeros + ExtendedState = VectorParam.UInt32( + [ + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + ], + "extended state enumeration", + ) # 8000_0001h FamilyModelSteppingBrandFeatures = VectorParam.UInt32( [0x00020F51, 0x00000405, 0xEBD3FBFF, 0x00020001], diff --git a/src/arch/x86/cpuid.cc b/src/arch/x86/cpuid.cc index 75e69735c1..2ce9ec9289 100644 ---
a/src/arch/x86/cpuid.cc +++ b/src/arch/x86/cpuid.cc @@ -146,6 +146,18 @@ X86CPUID::stringToRegister(const char *str) bool X86CPUID::hasSignificantIndex(uint32_t function) { + uint16_t family = bits(function, 31, 16); + uint16_t funcNum = bits(function, 15, 0); + + if (family == 0x0000) { + switch (funcNum) { + case ExtendedState: + return true; + default: + return false; + } + } + return false; } diff --git a/src/arch/x86/cpuid.hh b/src/arch/x86/cpuid.hh index 71e8d3c626..1c932980d2 100644 --- a/src/arch/x86/cpuid.hh +++ b/src/arch/x86/cpuid.hh @@ -52,6 +52,7 @@ enum StandardCpuidFunction MonitorMwait, ThermalPowerMgmt, ExtendedFeatures, + ExtendedState = 0xD, NumStandardCpuidFuncs }; diff --git a/src/arch/x86/isa.cc b/src/arch/x86/isa.cc index cf1ff9f593..9e6082a268 100644 --- a/src/arch/x86/isa.cc +++ b/src/arch/x86/isa.cc @@ -157,6 +157,7 @@ ISA::ISA(const X86ISAParams &p) cpuid->addStandardFunc(FamilyModelStepping, p.FamilyModelStepping); cpuid->addStandardFunc(CacheParams, p.CacheParams); cpuid->addStandardFunc(ExtendedFeatures, p.ExtendedFeatures); + cpuid->addStandardFunc(ExtendedState, p.ExtendedState); cpuid->addExtendedFunc(FamilyModelSteppingBrandFeatures, p.FamilyModelSteppingBrandFeatures); From 9acfc5a751c4df408b2fa9f73aa4512392c48c02 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 21 Jul 2023 16:25:03 -0500 Subject: [PATCH 5/5] configs: Enable AVX2 for GPUFS+KVM AVX is a requirement for some ROCm libraries, such as rocBLAS, which are themselves requirements for libraries higher up the stack like PyTorch. This patch sets the necessary CPUID bits in the GPUFS config to enable AVX, AVX2, and various SSE features so that applications using these libraries do not cause an illegal instruction trap. 
Change-Id: Id22f543fb2a06b268271725a54075ee6a9a1f041 --- configs/example/gpufs/system/system.py | 39 ++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 19df310295..7ddc4f0752 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -231,6 +231,42 @@ def makeGpuFSSystem(args): clock=args.ruby_clock, voltage_domain=system.voltage_domain ) + # If we are using KVM cpu, enable AVX. AVX is used in some ROCm libraries + # such as rocBLAS which is used in higher level libraries like PyTorch. + use_avx = False + if ObjectList.is_kvm_cpu(TestCPUClass): + # AVX also requires CR4.osxsave to be 1. These must be set together + # or KVM will error out. + system.workload.enable_osxsave = 1 + use_avx = True + + # These values are taken from a real CPU and are further explained here: + # https://sandpile.org/x86/cpuid.htm#level_0000_000Dh + avx_extended_state = [ + 0x00000007, + 0x00000340, + 0x00000000, + 0x00000340, + 0x0000000F, + 0x00000340, + 0x00000000, + 0x00000000, + 0x00000100, + 0x00000240, + 0x00000000, + 0x00000040, + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + ] + + # This modifies the default value for ECX only (4th in this array). + # See: https://sandpile.org/x86/cpuid.htm#level_0000_0001h + # Enables AVX, OSXSAVE, XSAVE, POPCNT, SSE4.2, SSE4.1, CMPXCHG16B, + # and FMA. + avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C983209] + for (i, cpu) in enumerate(system.cpu): # Break once we reach the shader "CPU" if i == args.num_cpus: @@ -247,6 +283,9 @@ def makeGpuFSSystem(args): for j in range(len(system.cpu[i].isa)): system.cpu[i].isa[j].vendor_string = "AuthenticAMD" + if use_avx: + system.cpu[i].isa[j].ExtendedState = avx_extended_state + system.cpu[i].isa[j].FamilyModelStepping = avx_cpu_features if args.host_parallel: # To get the KVM CPUs to run on different host CPUs, specify a