Improve inlining in cputlb.c.

Fix vector abs fallback.
 Only set parallel_cpus for SMP.
 Add vector dupm for 256-bit elements.
 -----BEGIN PGP SIGNATURE-----
 
 iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAl9RYlEdHHJpY2hhcmQu
 aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/A+ggAri5v3tr1JqjPGcE0
 6+yvUmhA9vyFH4QN1LS8abW9LWIhjLwiOQCqQWD6o5YDOs+YlHnr5JQCsHSmmhf/
 NlipyNrHXs2L0ClPFbvNyUnmTHFG5tjkedv3v4lTbAUfJP7/slVEiK1QzWrlyF2c
 61RLDvhNa/mWLZ+IbsCUKRDN05dc++XczBUghqGffWequR4oRegnDnJcNa1OvF3A
 DcH0U8dw4wG1yqW4NNJpQWrGOMfXTXJcHO9FSurrjH68TJxZqeMsjlw+VfN+D8a1
 AU3WpKhomtSjF6mgebBdOuy4lYC0UoZ+TTY/ycRXM47C9q1s6ccLO1FtGyj/Vo6b
 nxPOFQ==
 =j/GJ
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20200903' into staging

Improve inlining in cputlb.c.
Fix vector abs fallback.
Only set parallel_cpus for SMP.
Add vector dupm for 256-bit elements.

# gpg: Signature made Thu 03 Sep 2020 22:38:25 BST
# gpg:                using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg:                issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A  05C0 64DF 38E8 AF7E 215F

* remotes/rth/tags/pull-tcg-20200903:
  tcg: Implement 256-bit dup for tcg_gen_gvec_dup_mem
  tcg: Eliminate one store for in-place 128-bit dup_mem
  softmmu/cpus: Only set parallel_cpus for SMP
  tcg: Fix tcg gen for vectorized absolute value
  cputlb: Make store_helper less fragile to compiler optimizations

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2020-09-05 15:30:40 +01:00
commit 227de21ed0
3 changed files with 143 additions and 67 deletions

View file

@ -2009,6 +2009,80 @@ store_memop(void *haddr, uint64_t val, MemOp op)
}
}
/*
 * Out-of-line slow path used by store_helper() for stores that cannot
 * be completed with a single host access.
 *
 * The access is treated as touching two pages: the one containing
 * @addr and the one containing @addr + @size.  The second page is
 * brought into the TLB if needed, write watchpoints are checked for
 * both parts, and only then is @val written out one byte at a time
 * through helper_ret_stb_mmu().
 *
 * @big_endian: when true the most significant of the @size bytes of
 * @val is stored at @addr; when false the least significant one is.
 */
static void __attribute__((noinline))
store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
                       uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
                       bool big_endian)
{
    const size_t write_off = offsetof(CPUTLBEntry, addr_write);
    /* Base of the page holding the end of the access... */
    target_ulong hi_page = (addr + size) & TARGET_PAGE_MASK;
    /* ...and the offset of that end within the page. */
    size_t hi_len = (addr + size) & ~TARGET_PAGE_MASK;
    uintptr_t hi_index = tlb_index(env, mmu_idx, hi_page);
    CPUTLBEntry *hi_entry = tlb_entry(env, mmu_idx, hi_page);
    target_ulong hi_tlb_addr = tlb_addr_write(hi_entry);
    uintptr_t lo_index;
    target_ulong lo_tlb_addr;
    TCGMemOpIdx byte_oi;
    int i;

    /*
     * Make sure the second page has a TLB entry.  The first page is
     * already known to be present, and filling the second page
     * cannot evict it.
     */
    if (!tlb_hit_page(hi_tlb_addr, hi_page)) {
        bool in_victim = victim_tlb_hit(env, mmu_idx, hi_index,
                                        write_off, hi_page);

        if (!in_victim) {
            tlb_fill(env_cpu(env), hi_page, hi_len, MMU_DATA_STORE,
                     mmu_idx, retaddr);
            /* Look the entry up again now that it has been filled. */
            hi_index = tlb_index(env, mmu_idx, hi_page);
            hi_entry = tlb_entry(env, mmu_idx, hi_page);
        }
        hi_tlb_addr = tlb_addr_write(hi_entry);
    }

    /* Only now look up the first page. */
    lo_index = tlb_index(env, mmu_idx, addr);
    lo_tlb_addr = tlb_addr_write(tlb_entry(env, mmu_idx, addr));

    /*
     * Watchpoint handling may trap, so both halves have to be
     * checked before a single byte is written.
     */
    if (unlikely(lo_tlb_addr & TLB_WATCHPOINT)) {
        cpu_check_watchpoint(env_cpu(env), addr, size - hi_len,
                             env_tlb(env)->d[mmu_idx].iotlb[lo_index].attrs,
                             BP_MEM_WRITE, retaddr);
    }
    if (unlikely(hi_tlb_addr & TLB_WATCHPOINT)) {
        cpu_check_watchpoint(env_cpu(env), hi_page, hi_len,
                             env_tlb(env)->d[mmu_idx].iotlb[hi_index].attrs,
                             BP_MEM_WRITE, retaddr);
    }

    /*
     * XXX: simple rather than efficient.
     * The bytes must be written in ascending address order to avoid
     * issues with self-modifying code in Windows 64-bit.
     */
    byte_oi = make_memop_idx(MO_UB, mmu_idx);

    if (!big_endian) {
        /* Little-endian: byte i of the value goes to addr + i. */
        for (i = 0; i < size; ++i) {
            helper_ret_stb_mmu(env, addr + i, (uint8_t)(val >> (i * 8)),
                               byte_oi, retaddr);
        }
        return;
    }

    /* Big-endian: the highest-order byte goes to the lowest address. */
    for (i = 0; i < size; ++i) {
        uint8_t byte = val >> ((size - 1 - i) * 8);

        helper_ret_stb_mmu(env, addr + i, byte, byte_oi, retaddr);
    }
}
static inline void QEMU_ALWAYS_INLINE
store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
@ -2097,64 +2171,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
if (size > 1
&& unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
>= TARGET_PAGE_SIZE)) {
int i;
uintptr_t index2;
CPUTLBEntry *entry2;
target_ulong page2, tlb_addr2;
size_t size2;
do_unaligned_access:
/*
* Ensure the second page is in the TLB. Note that the first page
* is already guaranteed to be filled, and that the second page
* cannot evict the first.
*/
page2 = (addr + size) & TARGET_PAGE_MASK;
size2 = (addr + size) & ~TARGET_PAGE_MASK;
index2 = tlb_index(env, mmu_idx, page2);
entry2 = tlb_entry(env, mmu_idx, page2);
tlb_addr2 = tlb_addr_write(entry2);
if (!tlb_hit_page(tlb_addr2, page2)) {
if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
mmu_idx, retaddr);
index2 = tlb_index(env, mmu_idx, page2);
entry2 = tlb_entry(env, mmu_idx, page2);
}
tlb_addr2 = tlb_addr_write(entry2);
}
/*
* Handle watchpoints. Since this may trap, all checks
* must happen before any store.
*/
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
cpu_check_watchpoint(env_cpu(env), addr, size - size2,
env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
BP_MEM_WRITE, retaddr);
}
if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
cpu_check_watchpoint(env_cpu(env), page2, size2,
env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
BP_MEM_WRITE, retaddr);
}
/*
* XXX: not efficient, but simple.
* This loop must go in the forward direction to avoid issues
* with self-modifying code in Windows 64-bit.
*/
for (i = 0; i < size; ++i) {
uint8_t val8;
if (memop_big_endian(op)) {
/* Big-endian extract. */
val8 = val >> (((size - 1) * 8) - (i * 8));
} else {
/* Little-endian extract. */
val8 = val >> (i * 8);
}
helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
}
store_helper_unaligned(env, addr, val, retaddr, size,
mmu_idx, memop_big_endian(op));
return;
}
@ -2162,8 +2181,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
store_memop(haddr, val, op);
}
void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
TCGMemOpIdx oi, uintptr_t retaddr)
/*
 * Single-byte store helper: forwards its arguments unchanged to
 * store_helper() with the memory operation fixed to MO_UB.
 *
 * NOTE(review): presumably marked noinline because store_helper() is
 * force-inlined and store_helper_unaligned() calls this function once
 * per byte; keeping it out of line would stop that byte path from being
 * expanded into its callers -- confirm against the commit rationale.
 */
void __attribute__((noinline))
helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
TCGMemOpIdx oi, uintptr_t retaddr)
{
store_helper(env, addr, val, oi, retaddr, MO_UB);
}

View file

@ -1895,6 +1895,16 @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
if (!tcg_region_inited) {
tcg_region_inited = 1;
tcg_region_init();
/*
* If MTTCG, and we will create multiple cpus,
* then we will have cpus running in parallel.
*/
if (qemu_tcg_mttcg_enabled()) {
MachineState *ms = MACHINE(qdev_get_machine());
if (ms->smp.max_cpus > 1) {
parallel_cpus = true;
}
}
}
if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
@ -1904,7 +1914,6 @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
if (qemu_tcg_mttcg_enabled()) {
/* create a thread per vCPU with TCG (MTTCG) */
parallel_cpus = true;
snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
cpu->cpu_index);

View file

@ -1570,18 +1570,16 @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
tcg_temp_free_i64(in);
}
} else {
} else if (vece == 4) {
/* 128-bit duplicate. */
/* ??? Dup to 256-bit vector. */
int i;
tcg_debug_assert(vece == 4);
tcg_debug_assert(oprsz >= 16);
if (TCG_TARGET_HAS_v128) {
TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
tcg_gen_ld_vec(in, cpu_env, aofs);
for (i = 0; i < oprsz; i += 16) {
for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
tcg_gen_st_vec(in, cpu_env, dofs + i);
}
tcg_temp_free_vec(in);
@ -1591,7 +1589,7 @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
tcg_gen_ld_i64(in0, cpu_env, aofs);
tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
for (i = 0; i < oprsz; i += 16) {
for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
tcg_gen_st_i64(in0, cpu_env, dofs + i);
tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
}
@ -1601,6 +1599,54 @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
if (oprsz < maxsz) {
expand_clr(dofs + oprsz, maxsz - oprsz);
}
} else if (vece == 5) {
/* 256-bit duplicate. */
int i;
tcg_debug_assert(oprsz >= 32);
tcg_debug_assert(oprsz % 32 == 0);
if (TCG_TARGET_HAS_v256) {
TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
tcg_gen_ld_vec(in, cpu_env, aofs);
for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
tcg_gen_st_vec(in, cpu_env, dofs + i);
}
tcg_temp_free_vec(in);
} else if (TCG_TARGET_HAS_v128) {
TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
tcg_gen_ld_vec(in0, cpu_env, aofs);
tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
tcg_gen_st_vec(in0, cpu_env, dofs + i);
tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
}
tcg_temp_free_vec(in0);
tcg_temp_free_vec(in1);
} else {
TCGv_i64 in[4];
int j;
for (j = 0; j < 4; ++j) {
in[j] = tcg_temp_new_i64();
tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
}
for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
for (j = 0; j < 4; ++j) {
tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
}
}
for (j = 0; j < 4; ++j) {
tcg_temp_free_i64(in[j]);
}
}
if (oprsz < maxsz) {
expand_clr(dofs + oprsz, maxsz - oprsz);
}
} else {
g_assert_not_reached();
}
}
@ -2264,12 +2310,13 @@ static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
/*
* Invert (via xor -1) and add one (via sub -1).
* Invert (via xor -1) and add one.
* Because of the ordering the msb is cleared,
* so we never have carry into the next element.
*/
tcg_gen_xor_i64(d, b, t);
tcg_gen_sub_i64(d, d, t);
tcg_gen_andi_i64(t, t, dup_const(vece, 1));
tcg_gen_add_i64(d, d, t);
tcg_temp_free_i64(t);
}