From 35ccd7f90764f7d4db59efd66bbbe4c1e48496d7 Mon Sep 17 00:00:00 2001 From: Nitesh Narayana Date: Fri, 24 Nov 2023 15:20:30 +0100 Subject: [PATCH 001/521] arch-arm: This commit adds the mla/s indexed versions This includes the isa and instruction implementations of mla and mls indexed versions from ARM SVE2 ISA spec. Change-Id: I4fbd0382f23d8611e46411f74dc991f5a211a313 --- src/arch/arm/insts/sve.cc | 20 ++++++++ src/arch/arm/insts/sve.hh | 21 ++++++++ src/arch/arm/isa/formats/sve_2nd_level.isa | 58 ++++++++++++++++++++++ src/arch/arm/isa/insts/sve.isa | 37 ++++++++++++++ src/arch/arm/isa/templates/sve.isa | 40 +++++++++++++++ 5 files changed, 176 insertions(+) diff --git a/src/arch/arm/insts/sve.cc b/src/arch/arm/insts/sve.cc index b0512817a8..240a7fb116 100644 --- a/src/arch/arm/insts/sve.cc +++ b/src/arch/arm/insts/sve.cc @@ -435,6 +435,26 @@ SveTerPredOp::generateDisassembly( return ss.str(); } + +std::string +SveTerIndexedOp::generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss, "", false); + printVecReg(ss, dest, true); + ccprintf(ss, ", "); + printVecReg(ss, op1, true); + ccprintf(ss, ", "); + printVecReg(ss, op2, true); + ccprintf(ss, "["); + ccprintf(ss, "%lu", imm); + ccprintf(ss, "]"); + return ss.str(); +} + + + std::string SveTerUnpredOp::generateDisassembly( Addr pc, const loader::SymbolTable *symtab) const diff --git a/src/arch/arm/insts/sve.hh b/src/arch/arm/insts/sve.hh index dc18ff30a7..c23a975c8b 100644 --- a/src/arch/arm/insts/sve.hh +++ b/src/arch/arm/insts/sve.hh @@ -498,6 +498,27 @@ class SveTerPredOp : public ArmStaticInst Addr pc, const loader::SymbolTable *symtab) const override; }; +/// Ternary, destructive, unpredicated , !INDEXED! 
SVE Instruction +class SveTerIndexedOp : public ArmStaticInst +{ + protected: + RegIndex dest, op1, op2; + uint16_t imm; + uint8_t esize; + + SveTerIndexedOp(const char* mnem, ExtMachInst _machInst, + OpClass __opClass, RegIndex _dest, + RegIndex _op1, RegIndex _op2, uint16_t _imm) : + ArmStaticInst(mnem, _machInst, __opClass), + dest(_dest), op1(_op1), op2(_op2) , imm(_imm) + {} + + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; +}; + + + /// Ternary, destructive, unpredicated SVE instruction. class SveTerUnpredOp : public ArmStaticInst { diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa index 86c174d7c4..83c811cafd 100644 --- a/src/arch/arm/isa/formats/sve_2nd_level.isa +++ b/src/arch/arm/isa/formats/sve_2nd_level.isa @@ -245,6 +245,59 @@ namespace Aarch64 return new Unknown64(machInst); } // decodeSveIntMulAdd + StaticInstPtr + decodeSveMultiplyAccIndexed(ExtMachInst machInst) + { + RegIndex zda = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm ; + + //= (RegIndex) (uint8_t) bits(machInst, 19, 16); + + uint8_t size = bits(machInst, 23, 22); + uint16_t imm; + uint8_t opc = (bits(machInst, 10)); + + switch(size) { + case 0b00: + case 0b01: + zm = (RegIndex)(uint8_t)bits(machInst, 18, 16); + imm = (uint16_t)(bits(machInst, 22) << 2) + | bits(machInst, 20, 19); + switch(opc) { + case 0x0: return new Sve2Mlai( + machInst, zda, zn, zm, imm); + case 0x1: return new Sve2Mlsi( + machInst, zda, zn, zm, imm); + } + break; + + case 0b10: + zm = (RegIndex)(uint8_t)bits(machInst, 18, 16); + imm = (uint16_t)bits(machInst, 20, 19); + switch(opc) { + case 0x0: return new Sve2Mlai( + machInst, zda, zn, zm, imm); + case 0x1: return new Sve2Mlsi( + machInst, zda, zn, zm, imm); + } + break; + + case 0b11: + zm = (RegIndex)(uint8_t)bits(machInst, 19, 16); + imm = (uint16_t)bits(machInst, 20); + switch(opc) { + case 0x0: 
return new Sve2Mlai( + machInst, zda, zn, zm, imm); + case 0x1: return new Sve2Mlsi( + machInst, zda, zn, zm, imm); + } + break; + } + return new Unknown64(machInst); + + } // decodeSveMultiplyAccIndexed + StaticInstPtr decodeSveIntMatMulAdd(ExtMachInst machInst) { @@ -3920,6 +3973,11 @@ namespace Aarch64 return decodeSveIntegerDotProductIndexed(machInst); case 0b11: return decodeSveMixedSignDotProductIndexed(machInst); + + // for mla/s indexed , can be renamed + case 0b01: + return decodeSveMultiplyAccIndexed(machInst); + default: return new Unknown64(machInst); } diff --git a/src/arch/arm/isa/insts/sve.isa b/src/arch/arm/isa/insts/sve.isa index 9999843b59..e222f97f6f 100644 --- a/src/arch/arm/isa/insts/sve.isa +++ b/src/arch/arm/isa/insts/sve.isa @@ -2096,6 +2096,34 @@ let {{ 'class_name' : 'Sve' + Name} exec_output += SveOpExecDeclare.subst(substDict) + # Generates definitions for ternary SVE instructions (indexed) + def sveTerInstIndexed(name, Name, opClass, types, op, decoder='Generic'): + global header_output, exec_output, decoders + code = sveEnabledCheckCode + ''' + unsigned eCount = ArmStaticInst::getCurSveVecLen( + xc->tcBase()); + for (unsigned i = 0; i < eCount; i++) { + int segbase = i - i % (128 / sizeof(Element)); + int s = segbase + imm; + const Element& srcElem1 = AA64FpOp1_x[i]; + const Element& srcElem2 = AA64FpOp2_x[s]; + Element destElem = AA64FpDestMerge_x[i]; + ''' + + code += f"{op} \n" + + + code += ''' AA64FpDest_x[i] = destElem; + }''' + iop = ArmInstObjParams(name, 'Sve2' + Name+ 'i', 'SveTerIndexedOp', + {'code': code, 'op_class': opClass}, []) + header_output += SveTerIndexedOpDeclare.subst(iop) + exec_output += SveOpExecute.subst(iop) + for type in types: + substDict = {'targs' : type, + 'class_name' : 'Sve2' + Name + "i"} + exec_output += SveOpExecDeclare.subst(substDict) + # Generates definitions for ternary SVE intructions (always predicated - # merging) def sveTerInst(name, Name, opClass, types, op, decoder='Generic'): @@ -3094,6 
+3122,9 @@ let {{ substDict = {'targs': type, 'class_name': 'Sve' + Name} exec_output += SveOpExecDeclare.subst(substDict) + + + # Generate definition for DOT instructions def sveDotInst(name, Name, opClass, types, isIndexed = True): global header_output, exec_output, decoders @@ -4244,9 +4275,15 @@ let {{ # MLA mlaCode = 'destElem += srcElem1 * srcElem2;' sveTerInst('mla', 'Mla', 'SimdMultAccOp', signedTypes, mlaCode) + #indexed + sveTerInstIndexed('mla', 'Mla', 'SimdMultAccOp', signedTypes, mlaCode) + # MLS mlsCode = 'destElem -= srcElem1 * srcElem2;' sveTerInst('mls', 'Mls', 'SimdMultAccOp', signedTypes, mlsCode) + #indexed + sveTerInstIndexed('mls', 'Mls', 'SimdMultAccOp', signedTypes, mlsCode) + # ADCLT adcltCode = 'res = srcElem1 + srcElem2 + carryIn;' sveTerInstUnpred('adclt', 'Adclt', 'VectorIntegerArithOp', unsignedTypes, diff --git a/src/arch/arm/isa/templates/sve.isa b/src/arch/arm/isa/templates/sve.isa index 813bda029d..b254b1b43d 100644 --- a/src/arch/arm/isa/templates/sve.isa +++ b/src/arch/arm/isa/templates/sve.isa @@ -515,6 +515,44 @@ class %(class_name)s : public %(base_class)s }; }}; + + +def template SveTerIndexedOpDeclare {{ + /* + For mla indexed version as it is not included in gem5 right now. + Using ternary ops but all ops are vector regs. + index is the imm here. 
(name can be changed) +*/ +template +class %(class_name)s : public %(base_class)s +{ + //static_assert(sizeof(_SElementA) == sizeof(_SElementB), + // "Source elements must have the same size."); + + private: + %(reg_idx_arr_decl)s; + + protected: + typedef _Element Element; + typedef _Element TPElem; + + public: + // Constructor + %(class_name)s(ExtMachInst machInst, RegIndex _dest, + RegIndex _op1, RegIndex _op2, uint16_t _imm) : + %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, + _dest, _op1, _op2,_imm) + { + %(set_reg_idx_arr)s; + %(constructor)s; + esize = sizeof(Element); + } + + Fault execute(ExecContext *, trace::InstRecord *) const override; +}; +}}; + + def template SveMatMulOpDeclare {{ template class %(class_name)s : public %(base_class)s From e0c5f951103237394e942ad9ebc3182997a77b2a Mon Sep 17 00:00:00 2001 From: "Bobby R. Bruce" Date: Sun, 3 Dec 2023 13:46:55 -0800 Subject: [PATCH 002/521] misc: Merge Weekly GPU tests into Weekly Tests This separation was only for convenience while GPU tests were under development and rapidly changing. This commit merges the GPU tests into the weekly tests where they belong. Change-Id: I0e7118e863dba51334de89b3bbc3592374ef63ec --- .github/workflows/gpu-tests.yaml | 95 ----------------------------- .github/workflows/weekly-tests.yaml | 81 ++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 95 deletions(-) delete mode 100644 .github/workflows/gpu-tests.yaml diff --git a/.github/workflows/gpu-tests.yaml b/.github/workflows/gpu-tests.yaml deleted file mode 100644 index a60218e6e7..0000000000 --- a/.github/workflows/gpu-tests.yaml +++ /dev/null @@ -1,95 +0,0 @@ ---- -# This workflow runs all the Weekly GPU Tests. -# For now this file is kept separate as we are still developing and testing -# this workflow. 
It will eventually be merged with "weekly-tests.yaml" - -name: Weekly Tests (GPU) - -on: - # Runs every Sunday from 7AM UTC - schedule: - - cron: 00 7 * * 6 - # Allows us to manually start workflow for testing - workflow_dispatch: - -jobs: - build-gem5: - runs-on: [self-hosted, linux, x64] - container: ghcr.io/gem5/gcn-gpu:latest - steps: - - uses: actions/checkout@v3 - with: - # Scheduled workflows run on the default branch by default. We - # therefore need to explicitly checkout the develop branch. - ref: develop - - name: Build gem5 - run: scons build/VEGA_X86/gem5.opt -j $(nproc) --ignore-style - - uses: actions/upload-artifact@v3 - with: - name: weekly-test-${{ github.run_number }}-attempt-${{ github.run_attempt }}-gem5-build-vega - path: build/VEGA_X86/gem5.opt - retention-days: 5 - - run: echo "This job's status is ${{ job.status }}." - - LULESH-tests: - runs-on: [self-hosted, linux, x64] - container: ghcr.io/gem5/gcn-gpu:latest - needs: build-gem5 - timeout-minutes: 480 # 8 hours - steps: - - uses: actions/checkout@v3 - with: - # Scheduled workflows run on the default branch by default. We - # therefore need to explicitly checkout the develop branch. - ref: develop - - - name: Download build/VEGA_X86/gem5.opt - uses: actions/download-artifact@v3 - with: - name: weekly-test-${{ github.run_number }}-attempt-${{ github.run_attempt }}-gem5-build-vega - path: build/VEGA_X86 - # `download-artifact` does not preserve permissions so we need to set - # them again. - - run: chmod u+x build/VEGA_X86/gem5.opt - - - name: Obtain LULESH - # Obtains the latest LULESH compatible with this version of gem5 via - # gem5 Resources. 
- run: build/VEGA_X86/gem5.opt util/obtain-resource.py lulesh -p lulesh - - - name: Run LULUESH tests - working-directory: ${{ github.workspace }} - run: | - build/VEGA_X86/gem5.opt configs/example/apu_se.py -n3 --mem-size=8GB --reg-alloc-policy=dynamic --dgpu --gfx-version=gfx900 -c \ - lulesh --options="0.01 2" - - HACC-tests: - runs-on: [self-hosted, linux, x64] - container: ghcr.io/gem5/gcn-gpu:latest - needs: build-gem5 - timeout-minutes: 120 # 2 hours - steps: - - uses: actions/checkout@v3 - with: - # Scheduled workflows run on the default branch by default. We - # therefore need to explicitly checkout the develop branch. - ref: develop - - uses: actions/download-artifact@v3 - with: - name: weekly-test-${{ github.run_number }}-attempt-${{ github.run_attempt }}-gem5-build-vega - path: build/VEGA_X86 - - run: chmod u+x build/VEGA_X86/gem5.opt - - name: make hip directory - run: mkdir hip - - name: Compile m5ops and x86 - working-directory: ${{ github.workspace }}/util/m5 - run: | - export TERM=xterm-256color - scons build/x86/out/m5 - - name: Download tests - working-directory: ${{ github.workspace }}/hip - run: wget http://dist.gem5.org/dist/v22-1/test-progs/halo-finder/ForceTreeTest - - name: Run HACC tests - working-directory: ${{ github.workspace }} - run: | - build/VEGA_X86/gem5.opt configs/example/apu_se.py -n3 --reg-alloc-policy=dynamic --benchmark-root=hip -c ForceTreeTest --options="0.5 0.1 64 0.1 1 N 12 rcb" diff --git a/.github/workflows/weekly-tests.yaml b/.github/workflows/weekly-tests.yaml index 6c211435c2..019632a564 100644 --- a/.github/workflows/weekly-tests.yaml +++ b/.github/workflows/weekly-tests.yaml @@ -11,6 +11,87 @@ on: workflow_dispatch: jobs: + build-gcn-gpu-gem5: + runs-on: [self-hosted, linux, x64] + container: ghcr.io/gem5/gcn-gpu:latest + steps: + - uses: actions/checkout@v3 + with: + # Scheduled workflows run on the default branch by default. We + # therefore need to explicitly checkout the develop branch. 
+ ref: develop + - name: Build gem5 + run: scons build/VEGA_X86/gem5.opt -j $(nproc) --ignore-style + - uses: actions/upload-artifact@v3 + with: + name: weekly-test-${{ github.run_number }}-attempt-${{ github.run_attempt }}-gem5-build-vega + path: build/VEGA_X86/gem5.opt + retention-days: 5 + - run: echo "This job's status is ${{ job.status }}." + + LULESH-tests: + runs-on: [self-hosted, linux, x64] + container: ghcr.io/gem5/gcn-gpu:latest + needs: build-gcn-gpu-gem5 + timeout-minutes: 480 # 8 hours + steps: + - uses: actions/checkout@v3 + with: + # Scheduled workflows run on the default branch by default. We + # therefore need to explicitly checkout the develop branch. + ref: develop + + - name: Download build/VEGA_X86/gem5.opt + uses: actions/download-artifact@v3 + with: + name: weekly-test-${{ github.run_number }}-attempt-${{ github.run_attempt }}-gem5-build-vega + path: build/VEGA_X86 + # `download-artifact` does not preserve permissions so we need to set + # them again. + - run: chmod u+x build/VEGA_X86/gem5.opt + + - name: Obtain LULESH + # Obtains the latest LULESH compatible with this version of gem5 via + # gem5 Resources. + run: build/VEGA_X86/gem5.opt util/obtain-resource.py lulesh -p lulesh + + - name: Run LULUESH tests + working-directory: ${{ github.workspace }} + run: | + build/VEGA_X86/gem5.opt configs/example/apu_se.py -n3 --mem-size=8GB --reg-alloc-policy=dynamic --dgpu --gfx-version=gfx900 -c \ + lulesh --options="0.01 2" + + HACC-tests: + runs-on: [self-hosted, linux, x64] + container: ghcr.io/gem5/gcn-gpu:latest + needs: build-gcn-gpu-gem5 + timeout-minutes: 120 # 2 hours + steps: + - uses: actions/checkout@v3 + with: + # Scheduled workflows run on the default branch by default. We + # therefore need to explicitly checkout the develop branch. 
+ ref: develop + - uses: actions/download-artifact@v3 + with: + name: weekly-test-${{ github.run_number }}-attempt-${{ github.run_attempt }}-gem5-build-vega + path: build/VEGA_X86 + - run: chmod u+x build/VEGA_X86/gem5.opt + - name: make hip directory + run: mkdir hip + - name: Compile m5ops and x86 + working-directory: ${{ github.workspace }}/util/m5 + run: | + export TERM=xterm-256color + scons build/x86/out/m5 + - name: Download tests + working-directory: ${{ github.workspace }}/hip + run: wget http://dist.gem5.org/dist/v22-1/test-progs/halo-finder/ForceTreeTest + - name: Run HACC tests + working-directory: ${{ github.workspace }} + run: | + build/VEGA_X86/gem5.opt configs/example/apu_se.py -n3 --reg-alloc-policy=dynamic --benchmark-root=hip -c ForceTreeTest --options="0.5 0.1 64 0.1 1 N 12 rcb" + build-gem5: runs-on: [self-hosted, linux, x64] container: ghcr.io/gem5/ubuntu-22.04_all-dependencies:latest From db8e1652e8f0c39a55fa2c74a83d8637086337d6 Mon Sep 17 00:00:00 2001 From: Nitesh Narayana Date: Tue, 5 Dec 2023 23:40:06 +0100 Subject: [PATCH 003/521] arch-arm: This commit uses existing template code for mla/s index This includes mla/s index version implementation using the existing template code to avoid code repetition. 
Change-Id: If1de84e01dec638e206c979ca832308ebc904212 --- src/arch/arm/insts/sve.cc | 20 --------- src/arch/arm/insts/sve.hh | 21 ---------- src/arch/arm/isa/formats/sve_2nd_level.isa | 48 ++++++++++++---------- src/arch/arm/isa/insts/sve.isa | 32 +-------------- src/arch/arm/isa/templates/sve.isa | 34 --------------- 5 files changed, 29 insertions(+), 126 deletions(-) diff --git a/src/arch/arm/insts/sve.cc b/src/arch/arm/insts/sve.cc index 240a7fb116..b0512817a8 100644 --- a/src/arch/arm/insts/sve.cc +++ b/src/arch/arm/insts/sve.cc @@ -435,26 +435,6 @@ SveTerPredOp::generateDisassembly( return ss.str(); } - -std::string -SveTerIndexedOp::generateDisassembly( - Addr pc, const loader::SymbolTable *symtab) const -{ - std::stringstream ss; - printMnemonic(ss, "", false); - printVecReg(ss, dest, true); - ccprintf(ss, ", "); - printVecReg(ss, op1, true); - ccprintf(ss, ", "); - printVecReg(ss, op2, true); - ccprintf(ss, "["); - ccprintf(ss, "%lu", imm); - ccprintf(ss, "]"); - return ss.str(); -} - - - std::string SveTerUnpredOp::generateDisassembly( Addr pc, const loader::SymbolTable *symtab) const diff --git a/src/arch/arm/insts/sve.hh b/src/arch/arm/insts/sve.hh index c23a975c8b..dc18ff30a7 100644 --- a/src/arch/arm/insts/sve.hh +++ b/src/arch/arm/insts/sve.hh @@ -498,27 +498,6 @@ class SveTerPredOp : public ArmStaticInst Addr pc, const loader::SymbolTable *symtab) const override; }; -/// Ternary, destructive, unpredicated , !INDEXED! 
SVE Instruction -class SveTerIndexedOp : public ArmStaticInst -{ - protected: - RegIndex dest, op1, op2; - uint16_t imm; - uint8_t esize; - - SveTerIndexedOp(const char* mnem, ExtMachInst _machInst, - OpClass __opClass, RegIndex _dest, - RegIndex _op1, RegIndex _op2, uint16_t _imm) : - ArmStaticInst(mnem, _machInst, __opClass), - dest(_dest), op1(_op1), op2(_op2) , imm(_imm) - {} - - std::string generateDisassembly( - Addr pc, const loader::SymbolTable *symtab) const override; -}; - - - /// Ternary, destructive, unpredicated SVE instruction. class SveTerUnpredOp : public ArmStaticInst { diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa index 83c811cafd..dce4f9e998 100644 --- a/src/arch/arm/isa/formats/sve_2nd_level.isa +++ b/src/arch/arm/isa/formats/sve_2nd_level.isa @@ -250,50 +250,56 @@ namespace Aarch64 { RegIndex zda = (RegIndex) (uint8_t) bits(machInst, 4, 0); RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); - RegIndex zm ; - - //= (RegIndex) (uint8_t) bits(machInst, 19, 16); - uint8_t size = bits(machInst, 23, 22); - uint16_t imm; uint8_t opc = (bits(machInst, 10)); switch(size) { case 0b00: case 0b01: - zm = (RegIndex)(uint8_t)bits(machInst, 18, 16); - imm = (uint16_t)(bits(machInst, 22) << 2) + { + + RegIndex zm_16 = (RegIndex)(uint8_t)bits(machInst, 18, 16); + uint8_t imm_16 = (uint8_t)(bits(machInst, 22) << 2) | bits(machInst, 20, 19); - switch(opc) { - case 0x0: return new Sve2Mlai( - machInst, zda, zn, zm, imm); - case 0x1: return new Sve2Mlsi( - machInst, zda, zn, zm, imm); - } + switch(opc) + { + case 0x0: return new Sve2Mlai( + machInst, zda, zn, zm_16, imm_16); + case 0x1: return new Sve2Mlsi( + machInst, zda, zn, zm_16, imm_16); + } + } break; case 0b10: - zm = (RegIndex)(uint8_t)bits(machInst, 18, 16); - imm = (uint16_t)bits(machInst, 20, 19); + { + + RegIndex zm_32 = (RegIndex)(uint8_t)bits(machInst, 18, 16); + uint8_t imm_32 = (uint8_t)bits(machInst, 20, 19); switch(opc) { case 0x0: 
return new Sve2Mlai( - machInst, zda, zn, zm, imm); + machInst, zda, zn, zm_32, imm_32); case 0x1: return new Sve2Mlsi( - machInst, zda, zn, zm, imm); + machInst, zda, zn, zm_32, imm_32); } + } break; case 0b11: - zm = (RegIndex)(uint8_t)bits(machInst, 19, 16); - imm = (uint16_t)bits(machInst, 20); + { + + RegIndex zm_64 = (RegIndex)(uint8_t)bits(machInst, 19, 16); + uint8_t imm_64 = (uint8_t)bits(machInst, 20); switch(opc) { case 0x0: return new Sve2Mlai( - machInst, zda, zn, zm, imm); + machInst, zda, zn, zm_64, imm_64); case 0x1: return new Sve2Mlsi( - machInst, zda, zn, zm, imm); + machInst, zda, zn, zm_64, imm_64); } + } break; } + return new Unknown64(machInst); } // decodeSveMultiplyAccIndexed diff --git a/src/arch/arm/isa/insts/sve.isa b/src/arch/arm/isa/insts/sve.isa index e222f97f6f..e206106c61 100644 --- a/src/arch/arm/isa/insts/sve.isa +++ b/src/arch/arm/isa/insts/sve.isa @@ -2096,34 +2096,6 @@ let {{ 'class_name' : 'Sve' + Name} exec_output += SveOpExecDeclare.subst(substDict) - # Generates definitions for ternary SVE instructions (indexed) - def sveTerInstIndexed(name, Name, opClass, types, op, decoder='Generic'): - global header_output, exec_output, decoders - code = sveEnabledCheckCode + ''' - unsigned eCount = ArmStaticInst::getCurSveVecLen( - xc->tcBase()); - for (unsigned i = 0; i < eCount; i++) { - int segbase = i - i % (128 / sizeof(Element)); - int s = segbase + imm; - const Element& srcElem1 = AA64FpOp1_x[i]; - const Element& srcElem2 = AA64FpOp2_x[s]; - Element destElem = AA64FpDestMerge_x[i]; - ''' - - code += f"{op} \n" - - - code += ''' AA64FpDest_x[i] = destElem; - }''' - iop = ArmInstObjParams(name, 'Sve2' + Name+ 'i', 'SveTerIndexedOp', - {'code': code, 'op_class': opClass}, []) - header_output += SveTerIndexedOpDeclare.subst(iop) - exec_output += SveOpExecute.subst(iop) - for type in types: - substDict = {'targs' : type, - 'class_name' : 'Sve2' + Name + "i"} - exec_output += SveOpExecDeclare.subst(substDict) - # Generates definitions 
for ternary SVE intructions (always predicated - # merging) def sveTerInst(name, Name, opClass, types, op, decoder='Generic'): @@ -4276,13 +4248,13 @@ let {{ mlaCode = 'destElem += srcElem1 * srcElem2;' sveTerInst('mla', 'Mla', 'SimdMultAccOp', signedTypes, mlaCode) #indexed - sveTerInstIndexed('mla', 'Mla', 'SimdMultAccOp', signedTypes, mlaCode) + sveTerIdxInst('mla', '2Mlai', 'SimdMultAccOp', signedTypes, mlaCode) # MLS mlsCode = 'destElem -= srcElem1 * srcElem2;' sveTerInst('mls', 'Mls', 'SimdMultAccOp', signedTypes, mlsCode) #indexed - sveTerInstIndexed('mls', 'Mls', 'SimdMultAccOp', signedTypes, mlsCode) + sveTerIdxInst('mls', '2Mlsi', 'SimdMultAccOp', signedTypes, mlsCode) # ADCLT adcltCode = 'res = srcElem1 + srcElem2 + carryIn;' diff --git a/src/arch/arm/isa/templates/sve.isa b/src/arch/arm/isa/templates/sve.isa index b254b1b43d..ccca96022c 100644 --- a/src/arch/arm/isa/templates/sve.isa +++ b/src/arch/arm/isa/templates/sve.isa @@ -517,40 +517,6 @@ class %(class_name)s : public %(base_class)s -def template SveTerIndexedOpDeclare {{ - /* - For mla indexed version as it is not included in gem5 right now. - Using ternary ops but all ops are vector regs. - index is the imm here. 
(name can be changed) -*/ -template -class %(class_name)s : public %(base_class)s -{ - //static_assert(sizeof(_SElementA) == sizeof(_SElementB), - // "Source elements must have the same size."); - - private: - %(reg_idx_arr_decl)s; - - protected: - typedef _Element Element; - typedef _Element TPElem; - - public: - // Constructor - %(class_name)s(ExtMachInst machInst, RegIndex _dest, - RegIndex _op1, RegIndex _op2, uint16_t _imm) : - %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, - _dest, _op1, _op2,_imm) - { - %(set_reg_idx_arr)s; - %(constructor)s; - esize = sizeof(Element); - } - - Fault execute(ExecContext *, trace::InstRecord *) const override; -}; -}}; def template SveMatMulOpDeclare {{ From ee4c6a9bac6a8838237c369f8c23a878d317c557 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 5 Dec 2023 14:54:12 -0800 Subject: [PATCH 004/521] arch-riscv: Update riscv matched board - Update riscv matched board to work with new RiscvBootloaderKernelWorkload Change-Id: Ic20b964f33e73b76775bfe18798bd667f36253f6 --- .../riscvmatched/riscvmatched_board.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py index c735313a6a..f827ec7bff 100644 --- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py +++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py @@ -45,7 +45,7 @@ from m5.objects import ( PMAChecker, Port, RawDiskImage, - RiscvLinux, + RiscvBootloaderKernelWorkload, RiscvMmioVirtIO, RiscvRTC, VirtIOBlock, @@ -144,7 +144,7 @@ class RISCVMatchedBoard( @overrides(AbstractSystemBoard) def _setup_board(self) -> None: if self._fs: - self.workload = RiscvLinux() + self.workload = RiscvBootloaderKernelWorkload() # Contains a CLINT, PLIC, UART, and some functions for the dtb, etc. 
self.platform = HiFive() @@ -310,6 +310,18 @@ class RISCVMatchedBoard( self.mem_ranges = [AddrRange(memory.get_size())] memory.set_memory_range(self.mem_ranges) + @overrides(AbstractSystemBoard) + def _pre_instantiate(self): + if len(self._bootloader) > 0: + self.workload.bootloader_addr = 0x0 + self.workload.bootloader_filename = self._bootloader[0] + self.workload.kernel_addr = 0x80200000 + self.workload.entry_point = 0x80000000 # Bootloader starting point + else: + self.workload.kernel_addr = 0x0 + self.workload.entry_point = 0x80000000 + self._connect_things() + def generate_device_tree(self, outdir: str) -> None: """Creates the ``dtb`` and ``dts`` files. @@ -588,7 +600,7 @@ class RISCVMatchedBoard( kernel_args: Optional[List[str]] = None, exit_on_work_items: bool = True, ) -> None: - self.workload = RiscvLinux() + self.workload = RiscvBootloaderKernelWorkload() KernelDiskWorkload.set_kernel_disk_workload( self=self, kernel=kernel, From e2b3f0b8e4e50e40637361fe095ca2cfef914430 Mon Sep 17 00:00:00 2001 From: Yu-hsin Wang Date: Wed, 6 Dec 2023 13:25:02 +0800 Subject: [PATCH 005/521] mem: Add a flag on AbstractMemory to control statistics collection The stats initialization in the AbstractMemory allocates the space according to the max requestors of the System. This may cause issues in multiple system simulation. Suppose there are two systems, A and B. A has one requestor and a memory, while B has two requestors. When the requestor with requestor id 2 sends requests to the memory in A, the simulator would crash because requestor id 2 is out of the allocated space. The current solution is adding a SysBridge between A and B which would rewrite the requestor id to a valid one. This solution works but it requires placing the bridge at the correct boundary, which may not be easy. In addition, the stats would record mapped data which may not be accurate. To reduce the complexity, we add a flag to AbstractMemory to control the stats. 
If users don't want the statistics and want to solve the cross system issue simply, users can disable the statistics collection. We also make the flag True by default so as not to disturb current users. Change-Id: Ibb46a63d216d4f310b3e920815a295073496ea6e --- src/mem/AbstractMemory.py | 2 ++ src/mem/abstract_mem.cc | 23 ++++++++++++++--------- src/mem/abstract_mem.hh | 3 +++ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/mem/AbstractMemory.py b/src/mem/AbstractMemory.py index 57e47adcb1..32d18adab2 100644 --- a/src/mem/AbstractMemory.py +++ b/src/mem/AbstractMemory.py @@ -76,3 +76,5 @@ class AbstractMemory(ClockedObject): ) writeable = Param.Bool(True, "Allow writes to this memory") + + collect_stats = Param.Bool(True, "Collect traffic statistics") diff --git a/src/mem/abstract_mem.cc b/src/mem/abstract_mem.cc index 9340f7e96f..f22ea35527 100644 --- a/src/mem/abstract_mem.cc +++ b/src/mem/abstract_mem.cc @@ -63,8 +63,8 @@ AbstractMemory::AbstractMemory(const Params &p) : MemBackdoor::Readable | MemBackdoor::Writeable : MemBackdoor::Readable)), confTableReported(p.conf_table_reported), inAddrMap(p.in_addr_map), - kvmMap(p.kvm_map), writeable(p.writeable), _system(NULL), - stats(*this) + kvmMap(p.kvm_map), writeable(p.writeable), collectStats(p.collect_stats), + _system(NULL), stats(*this) { panic_if(!range.valid() || !range.size(), "Memory range %s must be valid with non-zero size.", @@ -433,7 +433,8 @@ AbstractMemory::access(PacketPtr pkt) assert(!pkt->req->isInstFetch()); TRACE_PACKET("Read/Write"); - stats.numOther[pkt->req->requestorId()]++; + if (collectStats) + stats.numOther[pkt->req->requestorId()]++; } } else if (pkt->isRead()) { assert(!pkt->isWrite()); @@ -447,10 +448,12 @@ AbstractMemory::access(PacketPtr pkt) pkt->setData(host_addr); } TRACE_PACKET(pkt->req->isInstFetch() ? 
"IFetch" : "Read"); - stats.numReads[pkt->req->requestorId()]++; - stats.bytesRead[pkt->req->requestorId()] += pkt->getSize(); - if (pkt->req->isInstFetch()) - stats.bytesInstRead[pkt->req->requestorId()] += pkt->getSize(); + if (collectStats) { + stats.numReads[pkt->req->requestorId()]++; + stats.bytesRead[pkt->req->requestorId()] += pkt->getSize(); + if (pkt->req->isInstFetch()) + stats.bytesInstRead[pkt->req->requestorId()] += pkt->getSize(); + } } else if (pkt->isInvalidate() || pkt->isClean()) { assert(!pkt->isWrite()); // in a fastmem system invalidating and/or cleaning packets @@ -466,8 +469,10 @@ AbstractMemory::access(PacketPtr pkt) } assert(!pkt->req->isInstFetch()); TRACE_PACKET("Write"); - stats.numWrites[pkt->req->requestorId()]++; - stats.bytesWritten[pkt->req->requestorId()] += pkt->getSize(); + if (collectStats) { + stats.numWrites[pkt->req->requestorId()]++; + stats.bytesWritten[pkt->req->requestorId()] += pkt->getSize(); + } } } else { panic("Unexpected packet %s", pkt->print()); diff --git a/src/mem/abstract_mem.hh b/src/mem/abstract_mem.hh index 7f12487421..8c85f4503e 100644 --- a/src/mem/abstract_mem.hh +++ b/src/mem/abstract_mem.hh @@ -132,6 +132,9 @@ class AbstractMemory : public ClockedObject // Are writes allowed to this memory const bool writeable; + // Should collect traffic statistics + const bool collectStats; + std::list lockedAddrList; // helper function for checkLockedAddrs(): we really want to From e4dccbea8a63ffa235512f356c0aa12abf4087dc Mon Sep 17 00:00:00 2001 From: Matthias Boettcher Date: Tue, 1 Oct 2019 14:23:18 +0100 Subject: [PATCH 006/521] arch-arm: Partial SVE2 Implementation Instructions added: BGRP, RAX1, EOR3, BCAX, XAR & TBX, PMUL, PMULLB/T, SMULLB/T and UMULLB/T Change-Id: Ia135ba9300eae312b24342bcbda835fef6867113 --- src/arch/arm/isa/formats/sve_2nd_level.isa | 219 ++++++++++++++++++- src/arch/arm/isa/formats/sve_top_level.isa | 26 ++- src/arch/arm/isa/insts/sve.isa | 241 ++++++++++++++++++++- 3 files changed, 464 
insertions(+), 22 deletions(-) diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa index 86c174d7c4..9b5ef4dbce 100644 --- a/src/arch/arm/isa/formats/sve_2nd_level.isa +++ b/src/arch/arm/isa/formats/sve_2nd_level.isa @@ -509,6 +509,193 @@ namespace Aarch64 return new Unknown64(machInst); } // decodeSveIntArithUnpred + StaticInstPtr + decodeSveIntMulUnpred(ExtMachInst machInst) + { + RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t opc = bits(machInst, 11, 10); + uint8_t size = bits(machInst, 23, 22); + + switch (opc) { + case 0x1: + if (size == 0x0) { + return new SvePmul(machInst, zd, zn, zm); + } + [[fallthrough]]; + case 0x0: + // MUL (vectors, unpredicated) + case 0x2: + // SMULH (unpredicated) + case 0x3: + // UMULH (unpredicated) + default: + return new Unknown64(machInst); + } + + } // decodeSveIntMulUnpred + + StaticInstPtr + decodeSveIntTerUnpred(ExtMachInst machInst) + { + RegIndex zdn = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zk = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t opc = bits(machInst, 23, 22) << 1 | bits(machInst, 10); + + switch (opc) { + case 0x0: + return new SveEor3(machInst, zdn, zm, zk); + case 0x2: + return new SveBcax(machInst, zdn, zm, zk); + case 0x1: + // BSL + case 0x3: + // BSL1N + case 0x5: + // BSL2N + case 0x7: + // NBSL + default: + return new Unknown64(machInst); + } + } // decodeSveIntTerUnpred + + StaticInstPtr + decodeSve2IntMulLong(ExtMachInst machInst) + { + RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t opc_u_t = bits(machInst, 12, 10); + uint8_t size = bits(machInst, 23, 22); + + switch (opc_u_t) { + case 0x2: + 
return decodeSveBinUnpredS2( + size, machInst, zd, zn, zm); + case 0x3: + return decodeSveBinUnpredS2( + size, machInst, zd, zn, zm); + case 0x4: + return decodeSveBinUnpredSigned( + size, machInst, zd, zn, zm); + case 0x5: + return decodeSveBinUnpredSigned( + size, machInst, zd, zn, zm); + case 0x6: + return decodeSveBinUnpredUnsigned( + size, machInst, zd, zn, zm); + case 0x7: + return decodeSveBinUnpredUnsigned( + size, machInst, zd, zn, zm); + case 0x0: + // SQDMULLB + case 0x1: + // SQDMULLT + default: + return new Unknown64(machInst); + } + } // decodeSve2IntMulLong + + StaticInstPtr + decodeSve2BitPerm(ExtMachInst machInst) + { + RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t opc = bits(machInst, 11, 10); + uint8_t size = bits(machInst, 23, 22); + + switch (opc) { + case 0x2: + return decodeSveBinUnpredU( + size, machInst, zd, zn, zm); + case 0x0: + // BEXT + case 0x1: + // BDEP + default: + return new Unknown64(machInst); + } + } // decodeSve2BitPerm + + StaticInstPtr + decodeSveIntRotImm(ExtMachInst machInst) + { + RegIndex zdn = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 9, 5); + uint8_t imm3 = (RegIndex) (uint8_t) bits(machInst, 18, 16); + + uint8_t tsize = (bits(machInst, 23, 22) << 2) | bits(machInst, 20, 19); + uint8_t esize = 0; + uint8_t size = 0; + + if (tsize == 0x0) { + return new Unknown64(machInst); + } else if (tsize == 0x1) { + esize = 8; + } else if ((tsize & 0x0E) == 0x2) { + esize = 16; + size = 1; + } else if ((tsize & 0x0C) == 0x4) { + esize = 32; + size = 2; + } else if ((tsize & 0x08) == 0x8) { + esize = 64; + size = 3; + } + + unsigned rot_am = 2 * esize - ((tsize << 3) | imm3); + return decodeSveBinImmDestrUnpredU( + size, machInst, zdn, zm, rot_am); + } // decodeSveIntRotImm + + StaticInstPtr + decodeSve2CryptBinConstr(ExtMachInst machInst) + { 
+ RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t size = bits(machInst, 23, 22); + uint8_t opc = bits(machInst, 10); + uint8_t size_opc = (size << 1) | opc; + + switch (size_opc) { + case 0x1: + return new SveRax1(machInst, zd, zn, zm); + case 0x0: + // SM4EKEY + default: + return new Unknown64(machInst); + } + } // decodeSve2CryptBinConstr + + StaticInstPtr + decodeSve2WideIntArith(ExtMachInst machInst) + { + uint8_t op0 = bits(machInst, 14, 13); + switch (op0) { + case 0b11: + return decodeSve2IntMulLong(machInst); + default: + return new Unknown64(machInst); + } + } + + StaticInstPtr + decodeSve2Crypto(ExtMachInst machInst) + { + uint8_t op2 = bits(machInst, 12, 11); + switch (op2) { + case 0b10: + return decodeSve2CryptBinConstr(machInst); + default: + return new Unknown64(machInst); + } + } + StaticInstPtr decodeSveIntLogUnpred(ExtMachInst machInst) { @@ -1041,12 +1228,19 @@ namespace Aarch64 decodeSvePermUnpred(ExtMachInst machInst) { uint8_t b12_10 = bits(machInst, 12, 10); - if (b12_10 == 0x4) { + if ((b12_10 == 0x4) || (bits(machInst, 12, 11) == 0x1)) { unsigned size = (unsigned) bits(machInst, 23, 22); RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); - return decodeSveBinUnpredU(size, machInst, zd, zn, zm); + if (b12_10 == 0x4) { // TBL, two sources + return decodeSveBinUnpredU(size, machInst, zd, zn, zm); + } else if (bits(machInst, 10) == 0x1) { // TBX + return decodeSveBinUnpredU(size, machInst, zd, zn, zm); + // } else { // TBL, three sources + // TBL, three sources + } + return new Unknown64(machInst); } else if (bits(machInst, 20, 16) == 0x0 && b12_10 == 0x6) { uint8_t size = bits(machInst, 23, 22); RegIndex rn = makeSP( @@ -1391,7 +1585,6 @@ namespace Aarch64 RegIndex zn = (RegIndex) 
(uint8_t) bits(machInst, 9, 5); RegIndex pg = (RegIndex) (uint8_t) bits(machInst, 13, 10); RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); - uint8_t size = bits(machInst, 23, 22); return decodeSveBinConstrPredU(size, @@ -3877,16 +4070,18 @@ namespace Aarch64 } // decodeSveMemStore StaticInstPtr - decodeSveMisc(ExtMachInst machInst) { + decodeSveMisc(ExtMachInst machInst) + { switch(bits(machInst, 13, 10)) { - case 0b0110: { - return decodeSveIntMatMulAdd(machInst); - break; - } - default: { - return new Unknown64(machInst); - break; - } + case 0b0110: + return decodeSveIntMatMulAdd(machInst); + case 0b1100: + case 0b1101: + case 0b1110: + case 0b1111: + return decodeSve2BitPerm(machInst); + default: + return new Unknown64(machInst); } return new Unknown64(machInst); } // decodeSveMisc diff --git a/src/arch/arm/isa/formats/sve_top_level.isa b/src/arch/arm/isa/formats/sve_top_level.isa index cb390eb972..9ae075ba2d 100644 --- a/src/arch/arm/isa/formats/sve_top_level.isa +++ b/src/arch/arm/isa/formats/sve_top_level.isa @@ -45,7 +45,9 @@ namespace Aarch64 StaticInstPtr decodeSveIntArithUnaryPred(ExtMachInst machInst); StaticInstPtr decodeSveIntMulAdd(ExtMachInst machInst); StaticInstPtr decodeSveIntMatMulAdd(ExtMachInst machInst); + StaticInstPtr decodeSveIntMulUnpred(ExtMachInst machInst); StaticInstPtr decodeSveIntArithUnpred(ExtMachInst machInst); + StaticInstPtr decodeSveIntTerUnpred(ExtMachInst machInst); StaticInstPtr decodeSveIntLogUnpred(ExtMachInst machInst); StaticInstPtr decodeSveIndexGen(ExtMachInst machInst); StaticInstPtr decodeSveStackAlloc(ExtMachInst machInst); @@ -71,6 +73,12 @@ namespace Aarch64 StaticInstPtr decodeSveIntWideImmUnpred(ExtMachInst machInst); StaticInstPtr decodeSveClamp(ExtMachInst machInst); StaticInstPtr decodeSve2Accum(ExtMachInst machInst); + StaticInstPtr decodeSveIntRotImm(ExtMachInst machInst); + StaticInstPtr decodeSve2CryptBinConstr(ExtMachInst machInst); + StaticInstPtr decodeSve2BitPerm(ExtMachInst machInst); + 
StaticInstPtr decodeSve2IntMulLong(ExtMachInst machInst); + StaticInstPtr decodeSve2WideIntArith(ExtMachInst machInst); + StaticInstPtr decodeSve2Crypto(ExtMachInst machInst); StaticInstPtr decodeSveIntegerDotProductUnpred(ExtMachInst machInst); StaticInstPtr decodeSveIntegerDotProductIndexed(ExtMachInst machInst); @@ -129,10 +137,14 @@ namespace Aarch64 break; case 0b10: case 0b11: - if (bits(machInst, 21) == 0b0 && op2 == 0b10) { + if (bits(machInst, 21) == 0b0 && bits(op2, 1) == 0b0) { + return decodeSve2WideIntArith(machInst); + } else if (bits(machInst, 21) == 0b0 && op2 == 0b10) { return decodeSveMisc(machInst); } else if (bits(machInst, 21) == 0b0 && op2 == 0b11) { return decodeSve2Accum(machInst); + } else if (bits(machInst, 21) == 0b1 && bits(machInst, 15, 13) == 0b111) { + return decodeSve2Crypto(machInst); } else { return new Unknown64(machInst); } @@ -180,7 +192,15 @@ namespace Aarch64 switch (b_15_14) { case 0x0: if (b_13) { - return decodeSveIntLogUnpred(machInst); + if (bits(machInst, 11)) { + return decodeSveIntTerUnpred(machInst); + } else { + if (bits(machInst, 10)) { + return decodeSveIntRotImm(machInst); + } else { + return decodeSveIntLogUnpred(machInst); + } + } } else { if (!bits(machInst, 30)) { return decodeSveIntArithUnpred(machInst); @@ -189,7 +209,7 @@ namespace Aarch64 break; case 0x1: if (b_13) { - return new Unknown64(machInst); + return decodeSveIntMulUnpred(machInst); } else if (b_12) { return decodeSveStackAlloc(machInst); } else { diff --git a/src/arch/arm/isa/insts/sve.isa b/src/arch/arm/isa/insts/sve.isa index 9999843b59..a0cbd8711b 100644 --- a/src/arch/arm/isa/insts/sve.isa +++ b/src/arch/arm/isa/insts/sve.isa @@ -325,6 +325,28 @@ output header {{ } } + + // Decodes binary with immediate operand, destructive, unpredicated + // SVE instructions, handling unsigned variants only. + template