tcg-ppc64: Streamline tcg_out_tlb_read

Less conditional compilation.  Merge an add insn with the indexed
memory load insn.  Load the tlb addend earlier.  Avoid the address
update memory form.

Fix a bug that disallowed large enough tlb offsets for some guests.

Signed-off-by: Richard Henderson <rth@twiddle.net>
commit fedee3e7fd
parent fa94c3be7a
Author: Richard Henderson <rth@twiddle.net>
Date:   2013-07-31 15:11:44 -07:00
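The heart of the change is the softmmu fast path: the TLB comparator is now fetched with a plain LWZ/LD rather than the update-form LWZU/LDU, the TLB addend is loaded into R3 as early as possible, and the explicit ADD of addend and guest address disappears because the final memory access uses the indexed form, which performs that addition itself. Below is a compilable model of what the old and new fast paths compute for a 32-bit load; it is illustrative only (the function names are invented for the example), with comments noting the roughly corresponding ppc insns.

#include <stdint.h>

/* Old sequence: a separate ADD materializes the host address, then the
   indexed load uses it with rbase = 0.  */
static uint32_t ldl_fast_path_old(uintptr_t tlb_addend, uintptr_t guest_addr)
{
    uintptr_t host_addr = tlb_addend + guest_addr;  /* ADD  r0, r0, addr_reg */
    return *(uint32_t *)host_addr;                  /* lwzx data, 0, r0      */
}

/* New sequence: the addend stays in R3 and the indexed load folds the
   addition into the access, so one ADD per memory op is saved.  */
static uint32_t ldl_fast_path_new(uintptr_t tlb_addend, uintptr_t guest_addr)
{
    return *(uint32_t *)(tlb_addend + guest_addr);  /* lwzx data, R3, addr   */
}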


@@ -31,13 +31,11 @@
static uint8_t *tb_ret_addr;
#define FAST_PATH
#if TARGET_LONG_BITS == 32
#define LD_ADDR LWZU
#define LD_ADDR LWZ
#define CMP_L 0
#else
#define LD_ADDR LDU
#define LD_ADDR LD
#define CMP_L (1<<21)
#endif
@@ -816,38 +814,78 @@ static const void * const qemu_st_helpers[4] = {
helper_stq_mmu,
};
static void tcg_out_tlb_read(TCGContext *s, TCGReg r0, TCGReg r1, TCGReg r2,
TCGReg addr_reg, int s_bits, int offset)
/* Perform the TLB load and compare. Places the result of the comparison
in CR7, loads the addend of the TLB into R3, and returns the register
containing the guest address (zero-extended into R4). Clobbers R0 and R2. */
static TCGReg tcg_out_tlb_read(TCGContext *s, int s_bits, TCGReg addr_reg,
int mem_index, bool is_read)
{
#if TARGET_LONG_BITS == 32
tcg_out_ext32u(s, addr_reg, addr_reg);
int cmp_off
= (is_read
? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
: offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
TCGReg base = TCG_AREG0;
tcg_out_rlw(s, RLWINM, r0, addr_reg,
32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS),
32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS),
31 - CPU_TLB_ENTRY_BITS);
tcg_out32(s, ADD | TAB(r0, r0, TCG_AREG0));
tcg_out32(s, LWZU | TAI(r1, r0, offset));
tcg_out_rlw(s, RLWINM, r2, addr_reg, 0,
(32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
#else
tcg_out_rld(s, RLDICL, r0, addr_reg,
64 - TARGET_PAGE_BITS,
64 - CPU_TLB_BITS);
tcg_out_shli64(s, r0, r0, CPU_TLB_ENTRY_BITS);
tcg_out32(s, ADD | TAB(r0, r0, TCG_AREG0));
tcg_out32(s, LD_ADDR | TAI(r1, r0, offset));
if (!s_bits) {
tcg_out_rld(s, RLDICR, r2, addr_reg, 0, 63 - TARGET_PAGE_BITS);
/* Extract the page index, shifted into place for tlb index. */
if (TARGET_LONG_BITS == 32) {
/* Zero-extend the address into a place helpful for further use. */
tcg_out_ext32u(s, TCG_REG_R4, addr_reg);
addr_reg = TCG_REG_R4;
} else {
tcg_out_rld(s, RLDICL, r2, addr_reg,
64 - TARGET_PAGE_BITS,
TARGET_PAGE_BITS - s_bits);
tcg_out_rld(s, RLDICL, r2, r2, TARGET_PAGE_BITS, 0);
tcg_out_rld(s, RLDICL, TCG_REG_R3, addr_reg,
64 - TARGET_PAGE_BITS, 64 - CPU_TLB_BITS);
}
#endif
/* Compensate for very large offsets. */
if (add_off >= 0x8000) {
/* Most target env are smaller than 32k; none are larger than 64k.
Simplify the logic here merely to offset by 0x7ff0, giving us a
range just shy of 64k. Check this assumption. */
QEMU_BUILD_BUG_ON(offsetof(CPUArchState,
tlb_table[NB_MMU_MODES - 1][1])
> 0x7ff0 + 0x7fff);
tcg_out32(s, ADDI | TAI(TCG_REG_R2, base, 0x7ff0));
base = TCG_REG_R2;
cmp_off -= 0x7ff0;
add_off -= 0x7ff0;
}
/* Extraction and shifting, part 2. */
if (TARGET_LONG_BITS == 32) {
tcg_out_rlw(s, RLWINM, TCG_REG_R3, addr_reg,
32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS),
32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS),
31 - CPU_TLB_ENTRY_BITS);
} else {
tcg_out_shli64(s, TCG_REG_R3, TCG_REG_R3, CPU_TLB_ENTRY_BITS);
}
tcg_out32(s, ADD | TAB(TCG_REG_R3, TCG_REG_R3, base));
/* Load the tlb comparator. */
tcg_out32(s, LD_ADDR | TAI(TCG_REG_R2, TCG_REG_R3, cmp_off));
/* Load the TLB addend for use on the fast path. Do this asap
to minimize any load use delay. */
tcg_out32(s, LD | TAI(TCG_REG_R3, TCG_REG_R3, add_off));
/* Clear the non-page, non-alignment bits from the address. */
if (TARGET_LONG_BITS == 32) {
tcg_out_rlw(s, RLWINM, TCG_REG_R0, addr_reg, 0,
(32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
} else if (!s_bits) {
tcg_out_rld(s, RLDICR, TCG_REG_R0, addr_reg, 0, 63 - TARGET_PAGE_BITS);
} else {
tcg_out_rld(s, RLDICL, TCG_REG_R0, addr_reg,
64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
}
tcg_out32(s, CMP | BF(7) | RA(TCG_REG_R0) | RB(TCG_REG_R2) | CMP_L);
return addr_reg;
}
#endif
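The 0x7ff0 compensation above exists because D-form loads on ppc64 encode only a signed 16-bit displacement, so TLB offsets of 0x8000 and up cannot be reached directly from TCG_AREG0. Biasing the base register by 0x7ff0 extends the reachable range to 0x7ff0 + 0x7fff, which is exactly the bound the QEMU_BUILD_BUG_ON asserts. A small standalone check of that arithmetic (illustrative only, not part of the patch; the helper name is invented):

#include <assert.h>

/* D-form displacements are signed 16-bit: [-0x8000, 0x7fff].  With the
   base biased by 0x7ff0, a TLB offset 'off' is encoded as off - 0x7ff0,
   so it is reachable iff off <= 0x7ff0 + 0x7fff (just shy of 64k).  */
static int reachable_with_bias(int off)
{
    int disp = off - 0x7ff0;
    return disp >= -0x8000 && disp <= 0x7fff;
}

int main(void)
{
    assert(reachable_with_bias(0x8000));            /* unreachable without the bias */
    assert(reachable_with_bias(0x7ff0 + 0x7fff));   /* the bound checked at build time */
    assert(!reachable_with_bias(0x7ff0 + 0x8000));  /* one past the bound */
    return 0;
}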
@@ -875,10 +913,10 @@ static const uint32_t qemu_exts_opc[4] = {
static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
{
TCGReg addr_reg, data_reg, r0, r1, rbase;
TCGReg addr_reg, data_reg, rbase;
uint32_t insn, s_bits;
#ifdef CONFIG_SOFTMMU
TCGReg r2, ir;
TCGReg ir;
int mem_index;
void *label1_ptr, *label2_ptr;
#endif
@@ -890,20 +928,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
#ifdef CONFIG_SOFTMMU
mem_index = *args;
r0 = TCG_REG_R3;
r1 = TCG_REG_R4;
r2 = TCG_REG_R0;
rbase = 0;
tcg_out_tlb_read(s, r0, r1, r2, addr_reg, s_bits,
offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
tcg_out32(s, CMP | BF(7) | RA(r2) | RB(r1) | CMP_L);
addr_reg = tcg_out_tlb_read(s, s_bits, addr_reg, mem_index, true);
label1_ptr = s->code_ptr;
#ifdef FAST_PATH
tcg_out32(s, BC | BI(7, CR_EQ) | BO_COND_TRUE);
#endif
/* slow path */
ir = TCG_REG_R3;
@@ -919,42 +947,33 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
} else if (data_reg != TCG_REG_R3) {
tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_R3);
}
label2_ptr = s->code_ptr;
tcg_out32(s, B);
/* label1: fast path */
#ifdef FAST_PATH
reloc_pc14(label1_ptr, (tcg_target_long)s->code_ptr);
#endif
/* r0 now contains &env->tlb_table[mem_index][index].addr_read */
tcg_out32(s, LD | TAI(r0, r0,
offsetof(CPUTLBEntry, addend)
- offsetof(CPUTLBEntry, addr_read)));
/* r0 = env->tlb_table[mem_index][index].addend */
tcg_out32(s, ADD | TAB(r0, r0, addr_reg));
/* r0 = env->tlb_table[mem_index][index].addend + addr */
rbase = TCG_REG_R3;
#else /* !CONFIG_SOFTMMU */
#if TARGET_LONG_BITS == 32
tcg_out_ext32u(s, addr_reg, addr_reg);
#endif
r0 = addr_reg;
r1 = TCG_REG_R3;
rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
if (TARGET_LONG_BITS == 32) {
tcg_out_ext32u(s, TCG_REG_R2, addr_reg);
addr_reg = TCG_REG_R2;
}
#endif
insn = qemu_ldx_opc[opc];
if (!HAVE_ISA_2_06 && insn == LDBRX) {
tcg_out32(s, ADDI | TAI(r1, r0, 4));
tcg_out32(s, LWBRX | TAB(data_reg, rbase, r0));
tcg_out32(s, LWBRX | TAB( r1, rbase, r1));
tcg_out_rld(s, RLDIMI, data_reg, r1, 32, 0);
tcg_out32(s, ADDI | TAI(TCG_REG_R0, addr_reg, 4));
tcg_out32(s, LWBRX | TAB(data_reg, rbase, addr_reg));
tcg_out32(s, LWBRX | TAB(TCG_REG_R0, rbase, TCG_REG_R0));
tcg_out_rld(s, RLDIMI, data_reg, TCG_REG_R0, 32, 0);
} else if (insn) {
tcg_out32(s, insn | TAB(data_reg, rbase, r0));
tcg_out32(s, insn | TAB(data_reg, rbase, addr_reg));
} else {
insn = qemu_ldx_opc[s_bits];
tcg_out32(s, insn | TAB(data_reg, rbase, r0));
tcg_out32(s, insn | TAB(data_reg, rbase, addr_reg));
insn = qemu_exts_opc[s_bits];
tcg_out32(s, insn | RA(data_reg) | RS(data_reg));
}
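When the host lacks ISA 2.06's LDBRX, the fallback above assembles a byte-reversed 64-bit load from two LWBRX loads (at addr and addr + 4) merged with RLDIMI; the result is simply the little-endian reading of the eight bytes. A hedged C sketch of that composition (illustrative, not code from the patch; the function name is invented):

#include <stdint.h>

/* What the LWBRX/LWBRX/RLDIMI sequence computes: the byte at the lowest
   address ends up least significant, i.e. a little-endian 64-bit load.  */
static uint64_t ldq_byte_reversed(const uint8_t *p)
{
    uint64_t lo = (uint64_t)p[0] | (uint64_t)p[1] << 8          /* LWBRX at addr     */
                | (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24;
    uint64_t hi = (uint64_t)p[4] | (uint64_t)p[5] << 8          /* LWBRX at addr + 4 */
                | (uint64_t)p[6] << 16 | (uint64_t)p[7] << 24;
    return hi << 32 | lo;               /* RLDIMI data_reg, r0, 32, 0 */
}

The store side below uses the mirror-image sequence: STWBRX of the low word, then STWBRX of the high word shifted down by 32.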
@@ -966,10 +985,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
{
TCGReg addr_reg, r0, r1, rbase, data_reg;
TCGReg addr_reg, rbase, data_reg;
uint32_t insn;
#ifdef CONFIG_SOFTMMU
TCGReg r2, ir;
TCGReg ir;
int mem_index;
void *label1_ptr, *label2_ptr;
#endif
@@ -980,20 +999,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
#ifdef CONFIG_SOFTMMU
mem_index = *args;
r0 = TCG_REG_R3;
r1 = TCG_REG_R4;
r2 = TCG_REG_R0;
rbase = 0;
tcg_out_tlb_read(s, r0, r1, r2, addr_reg, opc,
offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
tcg_out32(s, CMP | BF(7) | RA(r2) | RB(r1) | CMP_L);
addr_reg = tcg_out_tlb_read(s, opc, addr_reg, mem_index, false);
label1_ptr = s->code_ptr;
#ifdef FAST_PATH
tcg_out32(s, BC | BI(7, CR_EQ) | BO_COND_TRUE);
#endif
/* slow path */
ir = TCG_REG_R3;
@@ -1008,34 +1017,25 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
tcg_out32(s, B);
/* label1: fast path */
#ifdef FAST_PATH
reloc_pc14(label1_ptr, (tcg_target_long)s->code_ptr);
#endif
tcg_out32(s, LD | TAI(r0, r0,
offsetof(CPUTLBEntry, addend)
- offsetof(CPUTLBEntry, addr_write)));
/* r0 = env->tlb_table[mem_index][index].addend */
tcg_out32(s, ADD | TAB(r0, r0, addr_reg));
/* r0 = env->tlb_table[mem_index][index].addend + addr */
reloc_pc14(label1_ptr, (tcg_target_long) s->code_ptr);
rbase = TCG_REG_R3;
#else /* !CONFIG_SOFTMMU */
#if TARGET_LONG_BITS == 32
tcg_out_ext32u(s, addr_reg, addr_reg);
#endif
r1 = TCG_REG_R3;
r0 = addr_reg;
rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
if (TARGET_LONG_BITS == 32) {
tcg_out_ext32u(s, TCG_REG_R2, addr_reg);
addr_reg = TCG_REG_R2;
}
#endif
insn = qemu_stx_opc[opc];
if (!HAVE_ISA_2_06 && insn == STDBRX) {
tcg_out32(s, STWBRX | SAB(data_reg, rbase, r0));
tcg_out32(s, ADDI | TAI(r1, r0, 4));
tcg_out32(s, STWBRX | SAB(data_reg, rbase, addr_reg));
tcg_out32(s, ADDI | TAI(TCG_REG_R2, addr_reg, 4));
tcg_out_shri64(s, TCG_REG_R0, data_reg, 32);
tcg_out32(s, STWBRX | SAB(TCG_REG_R0, rbase, r1));
tcg_out32(s, STWBRX | SAB(TCG_REG_R0, rbase, TCG_REG_R2));
} else {
tcg_out32(s, insn | SAB(data_reg, rbase, r0));
tcg_out32(s, insn | SAB(data_reg, rbase, addr_reg));
}
#ifdef CONFIG_SOFTMMU