qemu-patch-raspberry4/tcg/i386/tcg-target.h

234 lines
7.6 KiB
C
Raw Normal View History

/*
* Tiny Code Generator for QEMU
*
* Copyright (c) 2008 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef I386_TCG_TARGET_H
#define I386_TCG_TARGET_H
#define TCG_TARGET_INSN_UNIT_SIZE 1
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
tcg/i386: enable dynamic TLB sizing As the following experiments show, this series is a net perf gain, particularly for memory-heavy workloads. Experiments are run on an Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz. 1. System boot + shudown, debian aarch64: - Before (v3.1.0): Performance counter stats for './die.sh v3.1.0' (10 runs): 9019.797015 task-clock (msec) # 0.993 CPUs utilized ( +- 0.23% ) 29,910,312,379 cycles # 3.316 GHz ( +- 0.14% ) 54,699,252,014 instructions # 1.83 insn per cycle ( +- 0.08% ) 10,061,951,686 branches # 1115.541 M/sec ( +- 0.08% ) 172,966,530 branch-misses # 1.72% of all branches ( +- 0.07% ) 9.084039051 seconds time elapsed ( +- 0.23% ) - After: Performance counter stats for './die.sh tlb-dyn-v5' (10 runs): 8624.084842 task-clock (msec) # 0.993 CPUs utilized ( +- 0.23% ) 28,556,123,404 cycles # 3.311 GHz ( +- 0.13% ) 51,755,089,512 instructions # 1.81 insn per cycle ( +- 0.05% ) 9,526,513,946 branches # 1104.641 M/sec ( +- 0.05% ) 166,578,509 branch-misses # 1.75% of all branches ( +- 0.19% ) 8.680540350 seconds time elapsed ( +- 0.24% ) That is, a 4.4% perf increase. 2. System boot + shutdown, ubuntu 18.04 x86_64: - Before (v3.1.0): 56100.574751 task-clock (msec) # 1.016 CPUs utilized ( +- 4.81% ) 200,745,466,128 cycles # 3.578 GHz ( +- 5.24% ) 431,949,100,608 instructions # 2.15 insn per cycle ( +- 5.65% ) 77,502,383,330 branches # 1381.490 M/sec ( +- 6.18% ) 844,681,191 branch-misses # 1.09% of all branches ( +- 3.82% ) 55.221556378 seconds time elapsed ( +- 5.01% ) - After: 56603.419540 task-clock (msec) # 1.019 CPUs utilized ( +- 10.19% ) 202,217,930,479 cycles # 3.573 GHz ( +- 10.69% ) 439,336,291,626 instructions # 2.17 insn per cycle ( +- 14.14% ) 80,538,357,447 branches # 1422.853 M/sec ( +- 16.09% ) 776,321,622 branch-misses # 0.96% of all branches ( +- 3.77% ) 55.549661409 seconds time elapsed ( +- 10.44% ) No improvement (within noise range). Note that for this workload, increasing the time window too much can lead to perf degradation, since it flushes the TLB *very* frequently. 3. x86_64 SPEC06int: x86_64-softmmu speedup vs. v3.1.0 for SPEC06int (test set) Host: Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz (Skylake) 5.5 +------------------------------------------------------------------------+ | +-+ | 5 |-+.................+-+...............................tlb-dyn-v5.......+-| | * * | 4.5 |-+.................*.*................................................+-| | * * | 4 |-+.................*.*................................................+-| | * * | 3.5 |-+.................*.*................................................+-| | * * | 3 |-+......+-+*.......*.*................................................+-| | * * * * | 2.5 |-+......*..*.......*.*.................................+-+*...........+-| | * * * * * * | 2 |-+......*..*.......*.*.................................*..*...........+-| | * * * * * * +-+ | 1.5 |-+......*..*.......*.*.................................*..*.*+-+.*+-+.+-| | * * *+-+ * * +-+ *+-+ +-+ +-+ * * * * * * | 1 |++++-+*+*++*+*++*++*+*++*+*+++-+*+*+-++*+-++++-++++-+++*++*+*++*+*++*+++| | * * * * * * * * * * * * * * * * * * * * * * * * * * | 0.5 +------------------------------------------------------------------------+ 400.perlb401.bzip403.g429445.g456.hm462.libq464.h471.omn47483.xalancbgeomean png: https://imgur.com/YRF90f7 That is, a 1.51x average speedup over the baseline, with a max speedup of 5.17x. Here's a different look at the SPEC06int results, using KVM as the baseline: x86_64-softmmu slowdown vs. KVM for SPEC06int (test set) Host: Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz (Skylake) 25 +---------------------------------------------------------------------------+ | +-+ +-+ | | * * +-+ v3.1.0 | | * * +-+ tlb-dyn-v5 | | * * * * +-+ | 20 |-+.................*.*.............................*.+-+......*.*........+-| | * * * # # * * | | +-+ * * * # # * * | | * * * * * # # * * | 15 |-+......*.*........*.*.............................*.#.#......*.+-+......+-| | * * * * * # # * #|# | | * * * * +-+ * # # * +-+ | | * * +-+ * * ++-+ +-+ * # # * # # +-+ | | * * +-+ * * * ## *| +-+ * # # * # # +-+ | 10 |-+......*.*..*.+-+.*.*........*.##.......++-+.*.+-+*.#.#......*.#.#.*.*..+-| | * * * +-+ * * * ## +-+ *# # * # #* # # +-+ * # # * * | | * * * # # * * +-+ * ## * +-+ *# # * # #* # # * * * # # *+-+ | | * * * # # * * * +-+ * ## * # # *# # * # #* # # * * * # # * ## | 5 |-+......*.+-+*.#.#.*.*..*.#.#.*.##.*.#.#.*#.#.*.#.#*.#.#.*.*..*.#.#.*.##.+-| | * # #* # # * +-+* # # * ## * # # *# # * # #* # # * * * # # * ## | | * # #* # # * # #* # # * ## * # # *# # * # #* # # * +-+* # # * ## | | ++-+ * # #* # # * # #* # # * ## * # # *# # * # #* # # * # #* # # * ## | |+++*#+#+*+#+#*+#+#+*+#+#*+#+#+*+##+*+#+#+*#+#+*+#+#*+#+#+*+#+#*+#+#+*+##+++| 0 +---------------------------------------------------------------------------+ 400.perlbe401.bzi403.gc429445.go456.h462.libqu464.h471.omne4483.xalancbmgeomean png: https://imgur.com/YzAMNEV After this series, we bring down the average SPEC06int slowdown vs KVM from 11.47x to 7.58x. Tested-by: Alex Bennée <alex.bennee@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <20190116170114.26802-4-cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2019-01-16 18:01:14 +01:00
#define TCG_TARGET_IMPLEMENTS_DYN_TLB 1
#ifdef __x86_64__
# define TCG_TARGET_REG_BITS 64
# define TCG_TARGET_NB_REGS 32
#else
# define TCG_TARGET_REG_BITS 32
# define TCG_TARGET_NB_REGS 24
#endif
typedef enum {
TCG_REG_EAX = 0,
TCG_REG_ECX,
TCG_REG_EDX,
TCG_REG_EBX,
TCG_REG_ESP,
TCG_REG_EBP,
TCG_REG_ESI,
TCG_REG_EDI,
/* 64-bit registers; always define the symbols to avoid
too much if-deffing. */
TCG_REG_R8,
TCG_REG_R9,
TCG_REG_R10,
TCG_REG_R11,
TCG_REG_R12,
TCG_REG_R13,
TCG_REG_R14,
TCG_REG_R15,
TCG_REG_XMM0,
TCG_REG_XMM1,
TCG_REG_XMM2,
TCG_REG_XMM3,
TCG_REG_XMM4,
TCG_REG_XMM5,
TCG_REG_XMM6,
TCG_REG_XMM7,
/* 64-bit registers; likewise always define. */
TCG_REG_XMM8,
TCG_REG_XMM9,
TCG_REG_XMM10,
TCG_REG_XMM11,
TCG_REG_XMM12,
TCG_REG_XMM13,
TCG_REG_XMM14,
TCG_REG_XMM15,
TCG_REG_RAX = TCG_REG_EAX,
TCG_REG_RCX = TCG_REG_ECX,
TCG_REG_RDX = TCG_REG_EDX,
TCG_REG_RBX = TCG_REG_EBX,
TCG_REG_RSP = TCG_REG_ESP,
TCG_REG_RBP = TCG_REG_EBP,
TCG_REG_RSI = TCG_REG_ESI,
TCG_REG_RDI = TCG_REG_EDI,
TCG_AREG0 = TCG_REG_EBP,
TCG_REG_CALL_STACK = TCG_REG_ESP
} TCGReg;
/* used for function call generation */
#define TCG_TARGET_STACK_ALIGN 16
#if defined(_WIN64)
#define TCG_TARGET_CALL_STACK_OFFSET 32
#else
#define TCG_TARGET_CALL_STACK_OFFSET 0
#endif
extern bool have_bmi1;
extern bool have_popcnt;
extern bool have_avx1;
extern bool have_avx2;
/* optional instructions */
#define TCG_TARGET_HAS_div2_i32 1
#define TCG_TARGET_HAS_rot_i32 1
#define TCG_TARGET_HAS_ext8s_i32 1
#define TCG_TARGET_HAS_ext16s_i32 1
#define TCG_TARGET_HAS_ext8u_i32 1
#define TCG_TARGET_HAS_ext16u_i32 1
#define TCG_TARGET_HAS_bswap16_i32 1
#define TCG_TARGET_HAS_bswap32_i32 1
#define TCG_TARGET_HAS_neg_i32 1
#define TCG_TARGET_HAS_not_i32 1
#define TCG_TARGET_HAS_andc_i32 have_bmi1
#define TCG_TARGET_HAS_orc_i32 0
#define TCG_TARGET_HAS_eqv_i32 0
#define TCG_TARGET_HAS_nand_i32 0
#define TCG_TARGET_HAS_nor_i32 0
#define TCG_TARGET_HAS_clz_i32 1
#define TCG_TARGET_HAS_ctz_i32 1
#define TCG_TARGET_HAS_ctpop_i32 have_popcnt
#define TCG_TARGET_HAS_deposit_i32 1
#define TCG_TARGET_HAS_extract_i32 1
#define TCG_TARGET_HAS_sextract_i32 1
#define TCG_TARGET_HAS_movcond_i32 1
#define TCG_TARGET_HAS_add2_i32 1
#define TCG_TARGET_HAS_sub2_i32 1
#define TCG_TARGET_HAS_mulu2_i32 1
#define TCG_TARGET_HAS_muls2_i32 1
#define TCG_TARGET_HAS_muluh_i32 0
#define TCG_TARGET_HAS_mulsh_i32 0
#define TCG_TARGET_HAS_goto_ptr 1
#define TCG_TARGET_HAS_direct_jump 1
#if TCG_TARGET_REG_BITS == 64
/* Keep target addresses zero-extended in a register. */
#define TCG_TARGET_HAS_extrl_i64_i32 (TARGET_LONG_BITS == 32)
#define TCG_TARGET_HAS_extrh_i64_i32 (TARGET_LONG_BITS == 32)
#define TCG_TARGET_HAS_div2_i64 1
#define TCG_TARGET_HAS_rot_i64 1
#define TCG_TARGET_HAS_ext8s_i64 1
#define TCG_TARGET_HAS_ext16s_i64 1
#define TCG_TARGET_HAS_ext32s_i64 1
#define TCG_TARGET_HAS_ext8u_i64 1
#define TCG_TARGET_HAS_ext16u_i64 1
#define TCG_TARGET_HAS_ext32u_i64 1
#define TCG_TARGET_HAS_bswap16_i64 1
#define TCG_TARGET_HAS_bswap32_i64 1
#define TCG_TARGET_HAS_bswap64_i64 1
#define TCG_TARGET_HAS_neg_i64 1
#define TCG_TARGET_HAS_not_i64 1
#define TCG_TARGET_HAS_andc_i64 have_bmi1
#define TCG_TARGET_HAS_orc_i64 0
#define TCG_TARGET_HAS_eqv_i64 0
#define TCG_TARGET_HAS_nand_i64 0
#define TCG_TARGET_HAS_nor_i64 0
#define TCG_TARGET_HAS_clz_i64 1
#define TCG_TARGET_HAS_ctz_i64 1
#define TCG_TARGET_HAS_ctpop_i64 have_popcnt
#define TCG_TARGET_HAS_deposit_i64 1
#define TCG_TARGET_HAS_extract_i64 1
#define TCG_TARGET_HAS_sextract_i64 0
#define TCG_TARGET_HAS_movcond_i64 1
#define TCG_TARGET_HAS_add2_i64 1
#define TCG_TARGET_HAS_sub2_i64 1
#define TCG_TARGET_HAS_mulu2_i64 1
#define TCG_TARGET_HAS_muls2_i64 1
#define TCG_TARGET_HAS_muluh_i64 0
#define TCG_TARGET_HAS_mulsh_i64 0
#endif
/* We do not support older SSE systems, only beginning with AVX1. */
#define TCG_TARGET_HAS_v64 have_avx1
#define TCG_TARGET_HAS_v128 have_avx1
#define TCG_TARGET_HAS_v256 have_avx2
#define TCG_TARGET_HAS_andc_vec 1
#define TCG_TARGET_HAS_orc_vec 0
#define TCG_TARGET_HAS_not_vec 0
#define TCG_TARGET_HAS_neg_vec 0
#define TCG_TARGET_HAS_shi_vec 1
#define TCG_TARGET_HAS_shs_vec 0
#define TCG_TARGET_HAS_shv_vec 0
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 1
#define TCG_TARGET_HAS_minmax_vec 1
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
((ofs) == 0 && (len) == 16))
#define TCG_TARGET_deposit_i64_valid TCG_TARGET_deposit_i32_valid
/* Check for the possibility of high-byte extraction and, for 64-bit,
zero-extending 32-bit right-shift. */
#define TCG_TARGET_extract_i32_valid(ofs, len) ((ofs) == 8 && (len) == 8)
#define TCG_TARGET_extract_i64_valid(ofs, len) \
(((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
{
}
static inline void tb_target_set_jmp_target(uintptr_t tc_ptr,
uintptr_t jmp_addr, uintptr_t addr)
{
/* patch the branch destination */
atomic_set((int32_t *)jmp_addr, addr - (jmp_addr + 4));
/* no need to flush icache explicitly */
}
/* This defines the natural memory order supported by this
* architecture before guarantees made by various barrier
* instructions.
*
* The x86 has a pretty strong memory ordering which only really
* allows for some stores to be re-ordered after loads.
*/
#include "tcg-mo.h"
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
#ifdef CONFIG_SOFTMMU
#define TCG_TARGET_NEED_LDST_LABELS
#endif
#define TCG_TARGET_NEED_POOL_LABELS
#endif