qemu-patch-raspberry4/target-i386/mem_helper.c
Emilio G. Cota 37b995f6e7 target-i386: remove helper_lock()
It's been superseded by the atomic helpers.

The use of the atomic helpers provides a significant performance and scalability
improvement. Below is the result of running the atomic_add-test microbenchmark with:
 $ x86_64-linux-user/qemu-x86_64 tests/atomic_add-bench -o 5000000 -r $r -n $n
where $n is the number of threads and $r is the allowed range for the additions.

The scenarios measured are:
- atomic: implements x86's ADDL with the atomic_add helper (i.e. this patchset)
- cmpxchg: implements x86's ADDL with a TCG loop using the cmpxchg helper
- master: before this patchset

Results sorted in ascending range, i.e. descending degree of contention.
Y axis is Throughput in Mops/s. Tests are run on an AMD machine with 64
Opteron 6376 cores.

                atomic_add-bench: 5000000 ops/thread, [0,1] range

  25 ++---------+----------+---------+----------+----------+----------+---++
     + atomic +-E--+       +         +          +          +          +    |
     |cmpxchg +-H--+                                                       |
  20 +Emaster +-N--+                                                      ++
     ||                                                                    |
     |++                                                                   |
     ||                                                                    |
  15 +++                                                                  ++
     |N|                                                                   |
     |+|                                                                   |
  10 ++|                                                                  ++
     |+|+                                                                  |
     | |    -+E+------        +++  ---+E+------+E+------+E+-----+E+------+E|
     |+E+E+- +++     +E+------+E+--                                        |
   5 ++|+                                                                 ++
     |+N+H+---                                 +++                         |
     ++++N+--+H++----+++   +  +++  --++H+------+H+------+H++----+H+---+--- |
   0 ++---------+-----H----+---H-----+----------+----------+----------+---H+
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 5000000 ops/thread, [0,2] range

  25 ++---------+----------+---------+----------+----------+----------+---++
     ++atomic +-E--+       +         +          +          +          +    |
     |cmpxchg +-H--+                                                       |
  20 ++master +-N--+                                                      ++
     |E|                                                                   |
     |++                                                                   |
     ||E                                                                   |
  15 ++|                                                                  ++
     |N||                                                                  |
     |+||                                   ---+E+------+E+-----+E+------+E|
  10 ++| |        ---+E+------+E+-----+E+---                    +++      +++
     ||H+E+--+E+--                                                         |
     |+++++                                                                |
     | ||                                                                  |
   5 ++|+H+--                                  +++                        ++
     |+N+    -                              ---+H+------+H+------          |
     +  +N+--+H++----+H+---+--+H+----++H+---    +          +    +H+---+--+H|
   0 ++---------+----------+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 5000000 ops/thread, [0,8] range

  40 ++---------+----------+---------+----------+----------+----------+---++
     ++atomic +-E--+       +         +          +          +          +    |
  35 +cmpxchg +-H--+                                                      ++
     | master +-N--+               ---+E+------+E+------+E+-----+E+------+E|
  30 ++|                   ---+E+--   +++                                 ++
     | |            -+E+---                                                |
  25 ++E        ---- +++                                                  ++
     |+++++ -+E+                                                           |
  20 +E+ E-- +++                                                          ++
     |H|+++                                                                |
     |+|                                       +H+-------                  |
  15 ++H+                                   ---+++      +H+------         ++
     |N++H+--                         +++---                    +H+------++|
  10 ++ +++  -       +++           ---+H+                       +++      +H+
     | |     +H+-----+H+------+H+--                                        |
   5 ++|                      +++                                         ++
     ++N+N+--+N++          +         +          +          +          +    |
   0 ++---------+----------+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

               atomic_add-bench: 5000000 ops/thread, [0,128] range

  160 ++---------+---------+----------+---------+----------+----------+---++
      + atomic +-E--+      +          +         +          +          +    |
  140 +cmpxchg +-H--+                          +++      +++               ++
      | master +-N--+                           E--------E------+E+------++|
  120 ++                                      --|        |      +++       E+
      |                                     -- +++      +++              ++|
  100 ++                                   -                              ++
      |                                +++-                     +++      ++|
   80 ++                              -+E+    -+H+------+H+------H--------++
      |                           ----    ----                  +++       H|
      |            ---+E+-----+E+-  ---+H+                               ++|
   60 ++     +E+---   +++  ---+H+---                                      ++
      |    --+++   ---+H+--                                                |
   40 ++ +E+-+H+---                                                       ++
      |  +H+                                                               |
   20 +EE+                                                                ++
      +N+        +         +          +         +          +          +    |
    0 ++N-N---N--+---------+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

              atomic_add-bench: 5000000 ops/thread, [0,1024] range

  350 ++---------+---------+----------+---------+----------+----------+---++
      + atomic +-E--+      +          +         +          +          +    |
  300 +cmpxchg +-H--+                                                    +++
      | master +-N--+                                           +++       ||
      |                                                 +++      |    ----E|
  250 ++                                                 |   ----E----    ++
      |                                              ----E---    |    ---+H|
  200 ++                                      -+E+---   +++  ---+H+---    ++
      |                                   ----         -+H+--              |
      |                                +E+     +++ ---- +++                |
  150 ++                            ---+++  ---+H+-                       ++
      |                          ---  -+H+--                               |
  100 ++                   ---+E+ ---- +++                                ++
      |      +++   ---+E+-----+H+-                                         |
      |     -+E+------+H+--                                                |
   50 ++ +E+                                                              ++
      +EE+       +         +          +         +          +          +    |
    0 ++N-N---N--+---------+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

  hi-res: http://imgur.com/a/fMRmq

For master I stopped measuring after 8 threads, because there is little
point in measuring the well-known performance collapse of a contended lock.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-21-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-10-26 08:29:01 -07:00

216 lines
5.9 KiB
C

/*
* x86 memory access helpers
*
* Copyright (c) 2003 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "qemu/int128.h"
#include "tcg.h"
/*
 * Non-atomic CMPXCHG8B.
 *
 * Loads the 64-bit value at guest address a0 and compares it with EDX:EAX.
 * On a match the value ECX:EBX is written back and ZF is set; otherwise the
 * value just read is written back unchanged, EDX:EAX are loaded with it and
 * ZF is cleared.  The resulting flags are left in CC_SRC.
 *
 * The load and the store are two separate accesses, so this helper gives no
 * atomicity guarantee.
 */
void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
{
    uintptr_t retaddr = GETPC();
    int flags = cpu_cc_compute_all(env, CC_OP);
    /* Build the 64-bit operands: high half from EDX/ECX, low from EAX/EBX. */
    uint64_t expected = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
    uint64_t desired = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);
    uint64_t current = cpu_ldq_data_ra(env, a0, retaddr);
    bool matched = (current == expected);

    /* The store is issued unconditionally, whatever the comparison result. */
    cpu_stq_data_ra(env, a0, matched ? desired : current, retaddr);

    if (matched) {
        flags |= CC_Z;
    } else {
        /* Mismatch: report the value found in memory through EDX:EAX. */
        env->regs[R_EAX] = (uint32_t)current;
        env->regs[R_EDX] = (uint32_t)(current >> 32);
        flags &= ~CC_Z;
    }
    CC_SRC = flags;
}
/*
 * Atomic CMPXCHG8B.
 *
 * Compares the 64-bit value at guest address a0 with EDX:EAX and, when they
 * are equal, replaces it with ECX:EBX using a single compare-and-swap.  ZF is
 * set on a match; otherwise ZF is cleared and EDX:EAX receive the value that
 * was found in memory.  The resulting flags are left in CC_SRC.
 *
 * When CONFIG_ATOMIC64 is not defined no 64-bit compare-and-swap is used;
 * the helper instead leaves the CPU loop via cpu_loop_exit_atomic().
 */
void helper_cmpxchg8b(CPUX86State *env, target_ulong a0)
{
#ifdef CONFIG_ATOMIC64
    uint64_t oldv, cmpv, newv;
    int eflags;

    eflags = cpu_cc_compute_all(env, CC_OP);

    /* Build the 64-bit operands: high half from EDX/ECX, low from EAX/EBX. */
    cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
    newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);

#ifdef CONFIG_USER_ONLY
    {
        uint64_t *haddr = g2h(a0);
        /*
         * The operands handed to the host cmpxchg are converted to
         * little-endian in separate temporaries.  cmpv itself must stay in
         * CPU byte order: it is compared against the converted oldv below,
         * and reusing it for the swapped value would make that comparison
         * mix byte orders on big-endian hosts.
         */
        uint64_t le_cmpv = cpu_to_le64(cmpv);
        uint64_t le_newv = cpu_to_le64(newv);

        oldv = atomic_cmpxchg__nocheck(haddr, le_cmpv, le_newv);
        oldv = le64_to_cpu(oldv);
    }
#else
    {
        /* GETPC() must be evaluated in the helper itself, not in a callee. */
        uintptr_t ra = GETPC();
        int mem_idx = cpu_mmu_index(env, false);
        TCGMemOpIdx oi = make_memop_idx(MO_TEQ, mem_idx);

        oldv = helper_atomic_cmpxchgq_le_mmu(env, a0, cmpv, newv, oi, ra);
    }
#endif

    if (oldv == cmpv) {
        eflags |= CC_Z;
    } else {
        /* Mismatch: report the value found in memory through EDX:EAX. */
        env->regs[R_EAX] = (uint32_t)oldv;
        env->regs[R_EDX] = (uint32_t)(oldv >> 32);
        eflags &= ~CC_Z;
    }
    CC_SRC = eflags;
#else
    cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC());
#endif /* CONFIG_ATOMIC64 */
}
#ifdef TARGET_X86_64
/*
 * Non-atomic CMPXCHG16B.
 *
 * Raises #GP (EXCP0D_GPF) when a0 is not 16-byte aligned.  Otherwise reads
 * the 128-bit value at a0 as two 64-bit loads and compares it with RDX:RAX.
 * On a match RCX:RBX is written back and ZF is set; on a mismatch the value
 * just read is written back unchanged, RDX:RAX are loaded with it and ZF is
 * cleared.  The resulting flags are left in CC_SRC.
 *
 * The accesses are separate 64-bit loads and stores, so this helper gives no
 * atomicity guarantee.
 */
void helper_cmpxchg16b_unlocked(CPUX86State *env, target_ulong a0)
{
    uintptr_t ra = GETPC();
    Int128 oldv, cmpv, newv;
    uint64_t o0, o1;
    int eflags;
    bool success;

    if ((a0 & 0xf) != 0) {
        /* Use the already captured return address, as helper_cmpxchg16b does. */
        raise_exception_ra(env, EXCP0D_GPF, ra);
    }
    eflags = cpu_cc_compute_all(env, CC_OP);

    /* int128_make128() takes the low half first, then the high half. */
    cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
    newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]);

    o0 = cpu_ldq_data_ra(env, a0 + 0, ra);
    o1 = cpu_ldq_data_ra(env, a0 + 8, ra);

    oldv = int128_make128(o0, o1);
    success = int128_eq(oldv, cmpv);
    if (!success) {
        /* Mismatch: the store below rewrites the value that was read. */
        newv = oldv;
    }

    /* The stores are issued unconditionally, whatever the comparison result. */
    cpu_stq_data_ra(env, a0 + 0, int128_getlo(newv), ra);
    cpu_stq_data_ra(env, a0 + 8, int128_gethi(newv), ra);

    if (success) {
        eflags |= CC_Z;
    } else {
        env->regs[R_EAX] = int128_getlo(oldv);
        env->regs[R_EDX] = int128_gethi(oldv);
        eflags &= ~CC_Z;
    }
    CC_SRC = eflags;
}
/*
 * Atomic CMPXCHG16B.
 *
 * Raises #GP (EXCP0D_GPF) when a0 is not 16-byte aligned.  With
 * CONFIG_ATOMIC128 the 128-bit value at a0 is compared with RDX:RAX and, on
 * a match, replaced by RCX:RBX through helper_atomic_cmpxchgo_le_mmu(); ZF is
 * set on a match, otherwise ZF is cleared and RDX:RAX receive the value found
 * in memory.  The resulting flags are left in CC_SRC.
 *
 * Without CONFIG_ATOMIC128 the helper leaves the CPU loop via
 * cpu_loop_exit_atomic() instead.
 */
void helper_cmpxchg16b(CPUX86State *env, target_ulong a0)
{
    uintptr_t retaddr = GETPC();

    if ((a0 & 0xf) != 0) {
        /* Does not return, as in helper_cmpxchg16b_unlocked. */
        raise_exception_ra(env, EXCP0D_GPF, retaddr);
    }

#ifdef CONFIG_ATOMIC128
    {
        int flags = cpu_cc_compute_all(env, CC_OP);
        /* int128_make128() takes the low half first, then the high half. */
        Int128 expected = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
        Int128 desired = int128_make128(env->regs[R_EBX], env->regs[R_ECX]);
        int mmu_idx = cpu_mmu_index(env, false);
        TCGMemOpIdx memop = make_memop_idx(MO_TEQ | MO_ALIGN_16, mmu_idx);
        Int128 observed = helper_atomic_cmpxchgo_le_mmu(env, a0, expected,
                                                        desired, memop,
                                                        retaddr);

        if (!int128_eq(observed, expected)) {
            /* Mismatch: report the value found in memory through RDX:RAX. */
            env->regs[R_EAX] = int128_getlo(observed);
            env->regs[R_EDX] = int128_gethi(observed);
            flags &= ~CC_Z;
        } else {
            flags |= CC_Z;
        }
        CC_SRC = flags;
    }
#else
    cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);
#endif
}
#endif
/*
 * 16-bit BOUND check.
 *
 * Loads a signed 16-bit lower bound from a0 and a signed 16-bit upper bound
 * from a0 + 2, then checks that v (truncated to a signed 16-bit value) lies
 * within [lower, upper].  If it does not, EXCP05_BOUND is raised; when
 * HF_MPX_EN_MASK is set in hflags, env->bndcs_regs.sts is cleared first.
 */
void helper_boundw(CPUX86State *env, target_ulong a0, int v)
{
    uintptr_t retaddr = GETPC();
    int lower = cpu_ldsw_data_ra(env, a0, retaddr);
    int upper = cpu_ldsw_data_ra(env, a0 + 2, retaddr);
    int index = (int16_t)v;

    if (index >= lower && index <= upper) {
        return;
    }

    if (env->hflags & HF_MPX_EN_MASK) {
        env->bndcs_regs.sts = 0;
    }
    raise_exception_ra(env, EXCP05_BOUND, retaddr);
}
/*
 * 32-bit BOUND check.
 *
 * Loads the lower bound from a0 and the upper bound from a0 + 4 (32 bits
 * each, held in signed ints) and checks that v lies within [lower, upper].
 * If it does not, EXCP05_BOUND is raised; when HF_MPX_EN_MASK is set in
 * hflags, env->bndcs_regs.sts is cleared first.
 */
void helper_boundl(CPUX86State *env, target_ulong a0, int v)
{
    uintptr_t retaddr = GETPC();
    int lower = cpu_ldl_data_ra(env, a0, retaddr);
    int upper = cpu_ldl_data_ra(env, a0 + 4, retaddr);

    if (v >= lower && v <= upper) {
        return;
    }

    if (env->hflags & HF_MPX_EN_MASK) {
        env->bndcs_regs.sts = 0;
    }
    raise_exception_ra(env, EXCP05_BOUND, retaddr);
}
#if !defined(CONFIG_USER_ONLY)
/*
 * Try to fill the TLB for addr; if the MMU fault handler reports an error,
 * raise the pending exception (cs->exception_index with env->error_code).
 *
 * A NULL retaddr means the function was called from C code, i.e. not from
 * generated code or from helper.c.
 */
/* XXX: fix it to restore all registers */
void tlb_fill(CPUState *cs, target_ulong addr, MMUAccessType access_type,
              int mmu_idx, uintptr_t retaddr)
{
    X86CPU *cpu;
    CPUX86State *env;

    if (x86_cpu_handle_mmu_fault(cs, addr, access_type, mmu_idx) == 0) {
        /* Handler reported success: nothing to raise. */
        return;
    }

    cpu = X86_CPU(cs);
    env = &cpu->env;
    raise_exception_err_ra(env, cs->exception_index, env->error_code, retaddr);
}
#endif