2008-02-01 11:05:41 +01:00
|
|
|
/*
|
|
|
|
* Tiny Code Generator for QEMU
|
|
|
|
*
|
|
|
|
* Copyright (c) 2008 Fabrice Bellard
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
2016-06-29 11:14:47 +02:00
|
|
|
|
|
|
|
#ifndef I386_TCG_TARGET_H
|
|
|
|
#define I386_TCG_TARGET_H
|
2008-02-01 11:05:41 +01:00
|
|
|
|
2014-04-01 17:34:03 +02:00
|
|
|
#define TCG_TARGET_INSN_UNIT_SIZE 1
|
2015-05-05 09:18:22 +02:00
|
|
|
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
|
tcg/i386: enable dynamic TLB sizing
As the following experiments show, this series is a net perf gain,
particularly for memory-heavy workloads. Experiments are run on an
Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz.
1. System boot + shutdown, debian aarch64:
- Before (v3.1.0):
Performance counter stats for './die.sh v3.1.0' (10 runs):
9019.797015 task-clock (msec) # 0.993 CPUs utilized ( +- 0.23% )
29,910,312,379 cycles # 3.316 GHz ( +- 0.14% )
54,699,252,014 instructions # 1.83 insn per cycle ( +- 0.08% )
10,061,951,686 branches # 1115.541 M/sec ( +- 0.08% )
172,966,530 branch-misses # 1.72% of all branches ( +- 0.07% )
9.084039051 seconds time elapsed ( +- 0.23% )
- After:
Performance counter stats for './die.sh tlb-dyn-v5' (10 runs):
8624.084842 task-clock (msec) # 0.993 CPUs utilized ( +- 0.23% )
28,556,123,404 cycles # 3.311 GHz ( +- 0.13% )
51,755,089,512 instructions # 1.81 insn per cycle ( +- 0.05% )
9,526,513,946 branches # 1104.641 M/sec ( +- 0.05% )
166,578,509 branch-misses # 1.75% of all branches ( +- 0.19% )
8.680540350 seconds time elapsed ( +- 0.24% )
That is, a 4.4% perf increase.
2. System boot + shutdown, ubuntu 18.04 x86_64:
- Before (v3.1.0):
56100.574751 task-clock (msec) # 1.016 CPUs utilized ( +- 4.81% )
200,745,466,128 cycles # 3.578 GHz ( +- 5.24% )
431,949,100,608 instructions # 2.15 insn per cycle ( +- 5.65% )
77,502,383,330 branches # 1381.490 M/sec ( +- 6.18% )
844,681,191 branch-misses # 1.09% of all branches ( +- 3.82% )
55.221556378 seconds time elapsed ( +- 5.01% )
- After:
56603.419540 task-clock (msec) # 1.019 CPUs utilized ( +- 10.19% )
202,217,930,479 cycles # 3.573 GHz ( +- 10.69% )
439,336,291,626 instructions # 2.17 insn per cycle ( +- 14.14% )
80,538,357,447 branches # 1422.853 M/sec ( +- 16.09% )
776,321,622 branch-misses # 0.96% of all branches ( +- 3.77% )
55.549661409 seconds time elapsed ( +- 10.44% )
No improvement (within noise range). Note that for this workload,
increasing the time window too much can lead to perf degradation,
since it flushes the TLB *very* frequently.
3. x86_64 SPEC06int:
x86_64-softmmu speedup vs. v3.1.0 for SPEC06int (test set)
Host: Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz (Skylake)
5.5 +------------------------------------------------------------------------+
| +-+ |
5 |-+.................+-+...............................tlb-dyn-v5.......+-|
| * * |
4.5 |-+.................*.*................................................+-|
| * * |
4 |-+.................*.*................................................+-|
| * * |
3.5 |-+.................*.*................................................+-|
| * * |
3 |-+......+-+*.......*.*................................................+-|
| * * * * |
2.5 |-+......*..*.......*.*.................................+-+*...........+-|
| * * * * * * |
2 |-+......*..*.......*.*.................................*..*...........+-|
| * * * * * * +-+ |
1.5 |-+......*..*.......*.*.................................*..*.*+-+.*+-+.+-|
| * * *+-+ * * +-+ *+-+ +-+ +-+ * * * * * * |
1 |++++-+*+*++*+*++*++*+*++*+*+++-+*+*+-++*+-++++-++++-+++*++*+*++*+*++*+++|
| * * * * * * * * * * * * * * * * * * * * * * * * * * |
0.5 +------------------------------------------------------------------------+
400.perlb401.bzip403.g429445.g456.hm462.libq464.h471.omn47483.xalancbgeomean
png: https://imgur.com/YRF90f7
That is, a 1.51x average speedup over the baseline, with a max speedup
of 5.17x.
Here's a different look at the SPEC06int results, using KVM as the baseline:
x86_64-softmmu slowdown vs. KVM for SPEC06int (test set)
Host: Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz (Skylake)
25 +---------------------------------------------------------------------------+
| +-+ +-+ |
| * * +-+ v3.1.0 |
| * * +-+ tlb-dyn-v5 |
| * * * * +-+ |
20 |-+.................*.*.............................*.+-+......*.*........+-|
| * * * # # * * |
| +-+ * * * # # * * |
| * * * * * # # * * |
15 |-+......*.*........*.*.............................*.#.#......*.+-+......+-|
| * * * * * # # * #|# |
| * * * * +-+ * # # * +-+ |
| * * +-+ * * ++-+ +-+ * # # * # # +-+ |
| * * +-+ * * * ## *| +-+ * # # * # # +-+ |
10 |-+......*.*..*.+-+.*.*........*.##.......++-+.*.+-+*.#.#......*.#.#.*.*..+-|
| * * * +-+ * * * ## +-+ *# # * # #* # # +-+ * # # * * |
| * * * # # * * +-+ * ## * +-+ *# # * # #* # # * * * # # *+-+ |
| * * * # # * * * +-+ * ## * # # *# # * # #* # # * * * # # * ## |
5 |-+......*.+-+*.#.#.*.*..*.#.#.*.##.*.#.#.*#.#.*.#.#*.#.#.*.*..*.#.#.*.##.+-|
| * # #* # # * +-+* # # * ## * # # *# # * # #* # # * * * # # * ## |
| * # #* # # * # #* # # * ## * # # *# # * # #* # # * +-+* # # * ## |
| ++-+ * # #* # # * # #* # # * ## * # # *# # * # #* # # * # #* # # * ## |
|+++*#+#+*+#+#*+#+#+*+#+#*+#+#+*+##+*+#+#+*#+#+*+#+#*+#+#+*+#+#*+#+#+*+##+++|
0 +---------------------------------------------------------------------------+
400.perlbe401.bzi403.gc429445.go456.h462.libqu464.h471.omne4483.xalancbmgeomean
png: https://imgur.com/YzAMNEV
After this series, we bring down the average SPEC06int slowdown vs KVM
from 11.47x to 7.58x.
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <20190116170114.26802-4-cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2019-01-16 18:01:14 +01:00
|
|
|
#define TCG_TARGET_IMPLEMENTS_DYN_TLB 1
|
2014-04-01 17:34:03 +02:00
|
|
|
|
2013-08-20 23:41:29 +02:00
|
|
|
/*
 * Host register configuration, selected by whether the compiler defines
 * __x86_64__: 64-bit registers with 32 TCG registers when it does,
 * 32-bit registers with 24 TCG registers otherwise.
 * NOTE(review): the register counts presumably index into the TCGReg
 * enum (general-purpose registers first, then XMM) -- confirm against
 * the users of TCG_TARGET_NB_REGS.
 */
#ifdef __x86_64__
|
|
|
|
# define TCG_TARGET_REG_BITS 64
|
2017-08-17 23:47:43 +02:00
|
|
|
# define TCG_TARGET_NB_REGS 32
|
2010-06-04 02:35:17 +02:00
|
|
|
#else
|
2013-08-20 23:41:29 +02:00
|
|
|
# define TCG_TARGET_REG_BITS 32
|
2017-08-17 23:47:43 +02:00
|
|
|
# define TCG_TARGET_NB_REGS 24
|
2010-06-04 02:35:17 +02:00
|
|
|
#endif
|
2008-02-01 11:05:41 +01:00
|
|
|
|
2011-11-09 09:03:33 +01:00
|
|
|
/*
 * Host register numbering used by the TCG backend.
 *
 * Enumerators are assigned sequentially from TCG_REG_EAX = 0, giving:
 *    0..7   TCG_REG_EAX  .. TCG_REG_EDI
 *    8..15  TCG_REG_R8   .. TCG_REG_R15
 *   16..31  TCG_REG_XMM0 .. TCG_REG_XMM15
 * TCG_REG_RAX .. TCG_REG_RDI are aliases for TCG_REG_EAX .. TCG_REG_EDI.
 * TCG_AREG0 aliases TCG_REG_EBP and TCG_REG_CALL_STACK aliases
 * TCG_REG_ESP.
 * NOTE(review): TCG_AREG0 is presumably the register reserved for the
 * CPU state pointer -- confirm against the code generator.
 */
typedef enum {
|
2008-02-01 11:05:41 +01:00
|
|
|
TCG_REG_EAX = 0,
|
|
|
|
TCG_REG_ECX,
|
|
|
|
TCG_REG_EDX,
|
|
|
|
TCG_REG_EBX,
|
|
|
|
TCG_REG_ESP,
|
|
|
|
TCG_REG_EBP,
|
|
|
|
TCG_REG_ESI,
|
|
|
|
TCG_REG_EDI,
|
2010-06-04 02:35:17 +02:00
|
|
|
|
|
|
|
/* 64-bit registers; always define the symbols to avoid
   too much if-deffing. */
|
|
|
|
TCG_REG_R8,
|
|
|
|
TCG_REG_R9,
|
|
|
|
TCG_REG_R10,
|
|
|
|
TCG_REG_R11,
|
|
|
|
TCG_REG_R12,
|
|
|
|
TCG_REG_R13,
|
|
|
|
TCG_REG_R14,
|
|
|
|
TCG_REG_R15,
|
2017-08-17 23:47:43 +02:00
|
|
|
|
|
|
|
TCG_REG_XMM0,
|
|
|
|
TCG_REG_XMM1,
|
|
|
|
TCG_REG_XMM2,
|
|
|
|
TCG_REG_XMM3,
|
|
|
|
TCG_REG_XMM4,
|
|
|
|
TCG_REG_XMM5,
|
|
|
|
TCG_REG_XMM6,
|
|
|
|
TCG_REG_XMM7,
|
|
|
|
|
|
|
|
/* 64-bit registers; likewise always define. */
|
|
|
|
TCG_REG_XMM8,
|
|
|
|
TCG_REG_XMM9,
|
|
|
|
TCG_REG_XMM10,
|
|
|
|
TCG_REG_XMM11,
|
|
|
|
TCG_REG_XMM12,
|
|
|
|
TCG_REG_XMM13,
|
|
|
|
TCG_REG_XMM14,
|
|
|
|
TCG_REG_XMM15,
|
|
|
|
|
2010-06-04 02:35:17 +02:00
|
|
|
TCG_REG_RAX = TCG_REG_EAX,
|
|
|
|
TCG_REG_RCX = TCG_REG_ECX,
|
|
|
|
TCG_REG_RDX = TCG_REG_EDX,
|
|
|
|
TCG_REG_RBX = TCG_REG_EBX,
|
|
|
|
TCG_REG_RSP = TCG_REG_ESP,
|
|
|
|
TCG_REG_RBP = TCG_REG_EBP,
|
|
|
|
TCG_REG_RSI = TCG_REG_ESI,
|
|
|
|
TCG_REG_RDI = TCG_REG_EDI,
|
2018-10-30 22:52:44 +01:00
|
|
|
|
|
|
|
TCG_AREG0 = TCG_REG_EBP,
|
2018-10-30 22:55:43 +01:00
|
|
|
TCG_REG_CALL_STACK = TCG_REG_ESP
|
2011-11-09 09:03:33 +01:00
|
|
|
} TCGReg;
|
2008-02-01 11:05:41 +01:00
|
|
|
|
|
|
|
/* used for function call generation */
|
|
|
|
#define TCG_TARGET_STACK_ALIGN 16
|
2012-09-13 19:37:43 +02:00
|
|
|
#if defined(_WIN64)
|
|
|
|
#define TCG_TARGET_CALL_STACK_OFFSET 32
|
|
|
|
#else
|
2008-05-22 16:59:57 +02:00
|
|
|
#define TCG_TARGET_CALL_STACK_OFFSET 0
|
2012-09-13 19:37:43 +02:00
|
|
|
#endif
|
2008-02-01 11:05:41 +01:00
|
|
|
|
2014-01-28 06:49:17 +01:00
|
|
|
extern bool have_bmi1;
|
2016-11-22 14:15:04 +01:00
|
|
|
extern bool have_popcnt;
|
2017-08-17 23:47:43 +02:00
|
|
|
extern bool have_avx1;
|
|
|
|
extern bool have_avx2;
|
2014-01-28 06:49:17 +01:00
|
|
|
|
2009-03-10 20:37:46 +01:00
|
|
|
/* optional instructions */
|
2011-08-17 23:11:46 +02:00
|
|
|
#define TCG_TARGET_HAS_div2_i32 1
|
|
|
|
#define TCG_TARGET_HAS_rot_i32 1
|
|
|
|
#define TCG_TARGET_HAS_ext8s_i32 1
|
|
|
|
#define TCG_TARGET_HAS_ext16s_i32 1
|
|
|
|
#define TCG_TARGET_HAS_ext8u_i32 1
|
|
|
|
#define TCG_TARGET_HAS_ext16u_i32 1
|
|
|
|
#define TCG_TARGET_HAS_bswap16_i32 1
|
|
|
|
#define TCG_TARGET_HAS_bswap32_i32 1
|
|
|
|
#define TCG_TARGET_HAS_neg_i32 1
|
|
|
|
#define TCG_TARGET_HAS_not_i32 1
|
2014-01-28 06:49:17 +01:00
|
|
|
#define TCG_TARGET_HAS_andc_i32 have_bmi1
|
2011-08-17 23:11:46 +02:00
|
|
|
#define TCG_TARGET_HAS_orc_i32 0
|
|
|
|
#define TCG_TARGET_HAS_eqv_i32 0
|
|
|
|
#define TCG_TARGET_HAS_nand_i32 0
|
|
|
|
#define TCG_TARGET_HAS_nor_i32 0
|
2016-11-16 12:22:54 +01:00
|
|
|
#define TCG_TARGET_HAS_clz_i32 1
|
|
|
|
#define TCG_TARGET_HAS_ctz_i32 1
|
2016-11-22 14:15:04 +01:00
|
|
|
#define TCG_TARGET_HAS_ctpop_i32 have_popcnt
|
2011-09-29 18:52:11 +02:00
|
|
|
#define TCG_TARGET_HAS_deposit_i32 1
|
2016-10-14 21:08:13 +02:00
|
|
|
#define TCG_TARGET_HAS_extract_i32 1
|
|
|
|
#define TCG_TARGET_HAS_sextract_i32 1
|
2012-09-21 19:13:36 +02:00
|
|
|
#define TCG_TARGET_HAS_movcond_i32 1
|
2013-02-20 08:51:50 +01:00
|
|
|
#define TCG_TARGET_HAS_add2_i32 1
|
|
|
|
#define TCG_TARGET_HAS_sub2_i32 1
|
|
|
|
#define TCG_TARGET_HAS_mulu2_i32 1
|
2013-02-20 08:51:57 +01:00
|
|
|
#define TCG_TARGET_HAS_muls2_i32 1
|
2013-08-14 23:35:56 +02:00
|
|
|
#define TCG_TARGET_HAS_muluh_i32 0
|
|
|
|
#define TCG_TARGET_HAS_mulsh_i32 0
|
2017-04-27 05:29:18 +02:00
|
|
|
#define TCG_TARGET_HAS_goto_ptr 1
|
2017-08-01 07:02:31 +02:00
|
|
|
#define TCG_TARGET_HAS_direct_jump 1
|
2009-03-10 20:37:46 +01:00
|
|
|
|
2010-06-04 02:35:17 +02:00
|
|
|
#if TCG_TARGET_REG_BITS == 64
|
2018-12-01 01:31:15 +01:00
|
|
|
/* Keep target addresses zero-extended in a register. */
|
|
|
|
#define TCG_TARGET_HAS_extrl_i64_i32 (TARGET_LONG_BITS == 32)
|
|
|
|
#define TCG_TARGET_HAS_extrh_i64_i32 (TARGET_LONG_BITS == 32)
|
2011-08-17 23:11:46 +02:00
|
|
|
#define TCG_TARGET_HAS_div2_i64 1
|
|
|
|
#define TCG_TARGET_HAS_rot_i64 1
|
|
|
|
#define TCG_TARGET_HAS_ext8s_i64 1
|
|
|
|
#define TCG_TARGET_HAS_ext16s_i64 1
|
|
|
|
#define TCG_TARGET_HAS_ext32s_i64 1
|
|
|
|
#define TCG_TARGET_HAS_ext8u_i64 1
|
|
|
|
#define TCG_TARGET_HAS_ext16u_i64 1
|
|
|
|
#define TCG_TARGET_HAS_ext32u_i64 1
|
|
|
|
#define TCG_TARGET_HAS_bswap16_i64 1
|
|
|
|
#define TCG_TARGET_HAS_bswap32_i64 1
|
|
|
|
#define TCG_TARGET_HAS_bswap64_i64 1
|
|
|
|
#define TCG_TARGET_HAS_neg_i64 1
|
|
|
|
#define TCG_TARGET_HAS_not_i64 1
|
2014-01-28 06:49:17 +01:00
|
|
|
#define TCG_TARGET_HAS_andc_i64 have_bmi1
|
2011-08-17 23:11:46 +02:00
|
|
|
#define TCG_TARGET_HAS_orc_i64 0
|
|
|
|
#define TCG_TARGET_HAS_eqv_i64 0
|
|
|
|
#define TCG_TARGET_HAS_nand_i64 0
|
|
|
|
#define TCG_TARGET_HAS_nor_i64 0
|
2016-11-16 12:22:54 +01:00
|
|
|
#define TCG_TARGET_HAS_clz_i64 1
|
|
|
|
#define TCG_TARGET_HAS_ctz_i64 1
|
2016-11-22 14:15:04 +01:00
|
|
|
#define TCG_TARGET_HAS_ctpop_i64 have_popcnt
|
2011-09-29 18:52:11 +02:00
|
|
|
#define TCG_TARGET_HAS_deposit_i64 1
|
2016-10-14 21:08:13 +02:00
|
|
|
#define TCG_TARGET_HAS_extract_i64 1
|
2016-10-14 19:04:32 +02:00
|
|
|
#define TCG_TARGET_HAS_sextract_i64 0
|
2012-09-21 19:13:36 +02:00
|
|
|
#define TCG_TARGET_HAS_movcond_i64 1
|
2013-02-20 08:51:57 +01:00
|
|
|
#define TCG_TARGET_HAS_add2_i64 1
|
|
|
|
#define TCG_TARGET_HAS_sub2_i64 1
|
|
|
|
#define TCG_TARGET_HAS_mulu2_i64 1
|
|
|
|
#define TCG_TARGET_HAS_muls2_i64 1
|
2013-08-14 23:35:56 +02:00
|
|
|
#define TCG_TARGET_HAS_muluh_i64 0
|
|
|
|
#define TCG_TARGET_HAS_mulsh_i64 0
|
2010-06-04 02:35:17 +02:00
|
|
|
#endif
|
|
|
|
|
2017-08-17 23:47:43 +02:00
|
|
|
/* SSE-only hosts are not supported; vector ops require at least AVX1. */
|
|
|
|
#define TCG_TARGET_HAS_v64 have_avx1
|
|
|
|
#define TCG_TARGET_HAS_v128 have_avx1
|
|
|
|
#define TCG_TARGET_HAS_v256 have_avx2
|
|
|
|
|
|
|
|
#define TCG_TARGET_HAS_andc_vec 1
|
|
|
|
#define TCG_TARGET_HAS_orc_vec 0
|
|
|
|
#define TCG_TARGET_HAS_not_vec 0
|
|
|
|
#define TCG_TARGET_HAS_neg_vec 0
|
|
|
|
#define TCG_TARGET_HAS_shi_vec 1
|
|
|
|
#define TCG_TARGET_HAS_shs_vec 0
|
|
|
|
#define TCG_TARGET_HAS_shv_vec 0
|
|
|
|
#define TCG_TARGET_HAS_cmp_vec 1
|
|
|
|
#define TCG_TARGET_HAS_mul_vec 1
|
2018-12-18 04:00:41 +01:00
|
|
|
#define TCG_TARGET_HAS_sat_vec 1
|
2018-12-18 05:17:56 +01:00
|
|
|
#define TCG_TARGET_HAS_minmax_vec 1
|
2017-08-17 23:47:43 +02:00
|
|
|
|
2011-09-29 18:52:11 +02:00
|
|
|
/*
 * Predicate on a deposit's bit offset and length.  It is true only for
 * the (ofs, len) pairs (0, 8), (8, 8) and (0, 16).  The 64-bit variant
 * is an alias for the 32-bit predicate, so it accepts the same pairs.
 * NOTE(review): these pairs presumably correspond to the x86 low-byte,
 * high-byte and 16-bit sub-register writes -- confirm in the backend.
 * Both arguments are evaluated more than once; avoid side effects.
 */
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
|
|
|
|
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
|
|
|
|
((ofs) == 0 && (len) == 16))
|
|
|
|
#define TCG_TARGET_deposit_i64_valid TCG_TARGET_deposit_i32_valid
|
|
|
|
|
2016-10-14 21:08:13 +02:00
|
|
|
/*
 * Check for the possibility of high-byte extraction and, for 64-bit,
 * zero-extending 32-bit right-shift.
 *
 * The 32-bit predicate is true only for ofs == 8 with len == 8.
 * The 64-bit predicate additionally accepts any pair for which
 * ofs + len == 32.
 * Arguments may be evaluated more than once; avoid side effects.
 */
|
|
|
|
#define TCG_TARGET_extract_i32_valid(ofs, len) ((ofs) == 8 && (len) == 8)
|
|
|
|
#define TCG_TARGET_extract_i64_valid(ofs, len) \
|
|
|
|
(((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
|
|
|
|
|
2013-08-20 23:22:50 +02:00
|
|
|
/*
 * Instruction-cache flush hook for the range [start, stop).
 * The body is intentionally empty: neither argument is used and no
 * action is taken on this target.
 * NOTE(review): presumably because the host keeps its instruction cache
 * coherent with data writes -- confirm against the architecture manual.
 */
static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
|
2008-02-01 11:05:41 +01:00
|
|
|
{
|
|
|
|
}
|
2012-12-06 12:15:58 +01:00
|
|
|
|
2017-08-01 07:02:31 +02:00
|
|
|
/*
 * Retarget a previously emitted branch.
 *
 * Stores, via atomic_set(), the 32-bit value addr - (jmp_addr + 4) at
 * jmp_addr, i.e. the distance from the address 4 bytes past jmp_addr
 * to addr.
 *
 * tc_ptr:   not used by this implementation.
 * jmp_addr: address of the 32-bit field to overwrite.
 * addr:     new branch destination.
 *
 * NOTE(review): jmp_addr presumably points at the rel32 operand of a
 * jump instruction, and the distance is assumed to fit in 32 bits --
 * confirm with the code that emits the branch.
 */
static inline void tb_target_set_jmp_target(uintptr_t tc_ptr,
|
|
|
|
uintptr_t jmp_addr, uintptr_t addr)
|
|
|
|
{
|
|
|
|
/* patch the branch destination */
|
|
|
|
atomic_set((int32_t *)jmp_addr, addr - (jmp_addr + 4));
|
|
|
|
/* no need to flush icache explicitly */
|
|
|
|
}
|
|
|
|
|
2017-02-23 19:29:27 +01:00
|
|
|
/* This defines the natural memory order supported by this
 * architecture before guarantees made by various barrier
 * instructions.
 *
 * The x86 has a fairly strong memory ordering: it only really
 * allows for some stores to be re-ordered after loads.
 */
|
|
|
|
#include "tcg-mo.h"
|
|
|
|
|
|
|
|
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
|
|
|
|
|
2018-11-20 08:37:42 +01:00
|
|
|
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
|
|
|
|
|
2017-07-30 21:30:41 +02:00
|
|
|
#ifdef CONFIG_SOFTMMU
|
|
|
|
#define TCG_TARGET_NEED_LDST_LABELS
|
|
|
|
#endif
|
2017-07-21 07:56:42 +02:00
|
|
|
#define TCG_TARGET_NEED_POOL_LABELS
|
2017-07-30 21:30:41 +02:00
|
|
|
|
2012-12-06 12:15:58 +01:00
|
|
|
#endif
|